├── .gitignore ├── LICENSE ├── README.md ├── examples ├── baseline │ └── ebnerd_feat_baselines.py ├── beyond_accuracy │ └── make_beyond_accuracy.ipynb ├── datasets │ ├── ebnerd_descriptive_analysis.ipynb │ ├── ebnerd_overview.ipynb │ └── plot │ │ ├── article_read_time.png │ │ ├── body_len.png │ │ ├── category_distribution.png │ │ ├── category_distribution_ba.png │ │ ├── front_article_page.png │ │ ├── front_read_time.png │ │ ├── inview_len.png │ │ ├── subtitle_len.png │ │ └── title_len.png ├── quick_start │ ├── lstur_dummy.py │ ├── make_embedding_artifacts.ipynb │ ├── naml_dummy.py │ ├── npa_dummy.py │ ├── nrms_docvec_dummy.py │ ├── nrms_dummy.py │ ├── nrms_ebnerd.ipynb │ └── nrms_ebnerd.py └── reproducibility_scripts │ ├── args_nrms.py │ ├── args_nrms_docvec.py │ ├── ebnerd_nrms.py │ ├── ebnerd_nrms_doc_hist.py │ └── ebnerd_nrms_docvec.py ├── pyproject.toml ├── src ├── __init__.py └── ebrec │ ├── evaluation │ ├── __init__.py │ ├── _ba_test.py │ ├── beyond_accuracy.py │ ├── metrics │ │ ├── __init__.py │ │ ├── _beyond_accuracy.py │ │ ├── _classification.py │ │ ├── _ranking.py │ │ └── _sklearn.py │ ├── metrics_protocols.py │ ├── protocols.py │ └── utils.py │ ├── models │ ├── fastformer │ │ ├── __init__.py │ │ ├── dataloader.py │ │ ├── fastformer.py │ │ └── fastformer_wu.py │ └── newsrec │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── dataloader.py │ │ ├── layers.py │ │ ├── lstur.py │ │ ├── model_config.py │ │ ├── naml.py │ │ ├── npa.py │ │ ├── nrms.py │ │ ├── nrms_docvec.py │ │ └── utils.py │ └── utils │ ├── __init__.py │ ├── _articles.py │ ├── _articles_behaviors.py │ ├── _behaviors.py │ ├── _constants.py │ ├── _decay.py │ ├── _descriptive_analysis.py │ ├── _nlp.py │ ├── _polars.py │ ├── _python.py │ └── _torch.py └── test ├── bombing └── bomb_dataloader.py ├── data └── ebnerd │ ├── articles.parquet │ ├── behaviors.parquet │ ├── document_vector.parquet │ └── history.parquet ├── dataloader ├── test_fastformer.py └── test_newsrec.py └── evaluation └── test_beyond_accuracy.py /.gitignore: -------------------------------------------------------------------------------- 1 | share/python-wheels/ 2 | pip-wheel-metadata/ 3 | .ipynb_checkpoints/ 4 | .installed.cfg 5 | develop-eggs/ 6 | __pycache__/ 7 | *.egg-info/ 8 | downloads/ 9 | .DS_Store 10 | .Python 11 | wheels/ 12 | .vscode 13 | mlruns 14 | build/ 15 | .eggs/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | dist/ 20 | eggs/ 21 | lib/ 22 | var/ 23 | *.egg 24 | build 25 | .venv 26 | venv 27 | 28 | # just for now: 29 | evaluate_predictions.py 30 | ebnerd_predictions/ 31 | downloads.py 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Ekstra Bladet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 |

3 | 4 |

5 | 6 | # Introduction 7 | Hello there 👋🏽 8 | 9 | We recommend checking the repository frequently, as we are updating and documenting it along the way! 10 | 11 | ## EBNeRD 12 | Ekstra Bladet Recommender System repository, created for the RecSys'24 Challenge. 13 | 14 | # Getting Started 15 | We recommend [conda](https://docs.conda.io/projects/conda/en/latest/glossary.html#conda-environment) for environment management, and [VS Code](https://code.visualstudio.com/) for development. To install the necessary packages and run the example notebook: 16 | 17 | ``` 18 | # 1. Create and activate a new conda environment 19 | conda create -n <environment_name> python=3.11 20 | conda activate <environment_name> 21 | 22 | # 2. Clone this repo within VS Code or using the command line: 23 | git clone https://github.com/ebanalyse/ebnerd-benchmark.git 24 | 25 | # 3. Install the core ebrec package into the environment: 26 | pip install . 27 | ``` 28 | 29 | We have experienced issues installing *tensorflow* on M1 MacBooks (```sys_platform == 'darwin'```) when using conda. To avoid this, we suggest using venv when running on MacBooks: 30 | ``` 31 | python3 -m venv .venv 32 | source .venv/bin/activate 33 | ``` 34 | 35 | Alternatively, installing the conda environment as ```.venv``` in the project folder: 36 | ``` 37 | conda create -p .venv python==3.11.8 38 | conda activate ./.venv 39 | ``` 40 | 41 | ## Running on GPU 42 | ``` 43 | tensorflow-gpu; sys_platform == 'linux' 44 | tensorflow-macos; sys_platform == 'darwin' 45 | ``` 46 | 47 | # Algorithms 48 | To get started quickly, we have implemented several news recommender systems, specifically: 49 | [Neural News Recommendation with Long- and Short-term User Representations](https://aclanthology.org/P19-1033/) (LSTUR), 50 | [Neural News Recommendation with Personalized Attention](https://arxiv.org/abs/1907.05559) (NPA), 51 | [Neural News Recommendation with Attentive Multi-View Learning](https://arxiv.org/abs/1907.05576) (NAML), and 52 | [Neural News Recommendation with Multi-Head Self-Attention](https://aclanthology.org/D19-1671/) (NRMS). 53 | The source code originates from the brilliant recommender-systems repository [recommenders](https://github.com/recommenders-team/recommenders); we have simply stripped it of all non-model-related code. 54 | 55 | 56 | # Notebooks 57 | To help you get started, we have created a few simple notebooks. We plan to add more at a later stage, such as reproducible model trainings. 58 | The notebooks were made on macOS, and you might need to make small modifications to run them on your system. 59 | 60 | ## Model training 61 | We have created a [notebook](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/examples/00_quick_start/nrms_ebnerd.ipynb) where we train NRMS on EB-NeRD - this is a very simple version using the demo dataset. 62 | 63 | ## Data manipulation and enrichment 64 | In the [dataset_ebnerd](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/examples/00_quick_start/dataset_ebnerd.ipynb) demo, we show how one can join user histories onto the behavior logs and create binary labels; the sketch below shows the same steps in plain Python.
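If you prefer a quick script over the notebook, the snippet below is a minimal sketch of those two steps using the repository's own utilities (`truncate_history`, `slice_join_dataframes`, `create_binary_labels_column`). The data path, the `ebnerd_demo` split name, and the history size of 30 are placeholder assumptions - point them at your local copy of the dataset.

```
from pathlib import Path
import polars as pl

from ebrec.utils._constants import DEFAULT_HISTORY_ARTICLE_ID_COL, DEFAULT_USER_COL
from ebrec.utils._behaviors import create_binary_labels_column, truncate_history
from ebrec.utils._polars import slice_join_dataframes

# Placeholder location/split - adjust to wherever you unpacked EB-NeRD:
PATH = Path("~/ebnerd_data/ebnerd_demo").expanduser()

# Load the users' click histories and truncate them to a fixed length:
df_history = (
    pl.scan_parquet(PATH.joinpath("train", "history.parquet"))
    .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL)
    .pipe(
        truncate_history,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=30,
        padding_value=0,
        enable_warning=False,
    )
)

# Join the histories onto the impression logs and add the binary labels column
# (1 for clicked in-view articles, 0 for the rest):
df_behaviors = (
    pl.scan_parquet(PATH.joinpath("train", "behaviors.parquet"))
    .collect()
    .pipe(
        slice_join_dataframes,
        df2=df_history.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )
    .pipe(create_binary_labels_column)
)
print(df_behaviors.head())
```

The history-joining part is the same pattern wrapped up in `ebnerd_from_path`, which the quick-start and reproducibility scripts use before adding labels.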
65 | 66 | # Reproduce EB-NeRD Experiments 67 | 68 | Activate your environment: 69 | ``` 70 | conda activate <environment_name> 71 | ``` 72 | 73 | ### [NRMSModel](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/src/ebrec/models/newsrec/nrms.py) 74 | 75 | ``` 76 | python examples/reproducibility_scripts/ebnerd_nrms.py \ 77 | --datasplit ebnerd_small \ 78 | --epochs 5 \ 79 | --bs_train 32 \ 80 | --bs_test 32 \ 81 | --history_size 20 \ 82 | --npratio 4 \ 83 | --transformer_model_name FacebookAI/xlm-roberta-large \ 84 | --max_title_length 30 \ 85 | --head_num 20 \ 86 | --head_dim 20 \ 87 | --attention_hidden_dim 200 \ 88 | --learning_rate 1e-4 \ 89 | --dropout 0.20 90 | ``` 91 | 92 | Launch TensorBoard: 93 | ``` 94 | tensorboard --logdir=ebnerd_predictions/runs 95 | ``` 96 | 97 | ### [NRMSDocVec](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/src/ebrec/models/newsrec/nrms_docvec.py) 98 | 99 | ``` 100 | python examples/reproducibility_scripts/ebnerd_nrms_docvec.py \ 101 | --datasplit ebnerd_small \ 102 | --epochs 5 \ 103 | --bs_train 32 \ 104 | --history_size 20 \ 105 | --npratio 4 \ 106 | --document_embeddings Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet \ 107 | --head_num 16 \ 108 | --head_dim 16 \ 109 | --attention_hidden_dim 200 \ 110 | --newsencoder_units_per_layer 512 512 512 \ 111 | --learning_rate 1e-4 \ 112 | --dropout 0.2 \ 113 | --newsencoder_l2_regularization 1e-4 114 | ``` 115 | 116 | Launch TensorBoard: 117 | ``` 118 | tensorboard --logdir=ebnerd_predictions/runs 119 | ``` 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /examples/baseline/ebnerd_feat_baselines.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from tqdm import tqdm 3 | import polars as pl 4 | 5 | from ebrec.utils._python import ( 6 | rank_predictions_by_score, 7 | write_submission_file, 8 | create_lookup_dict, 9 | ) 10 | from ebrec.utils._constants import * 11 | 12 | PATH = Path("~/ebnerd_data/ebnerd_testset") 13 | 14 | df_behaviors = pl.scan_parquet(PATH.joinpath("test", "behaviors.parquet")) 15 | df_articles = pl.scan_parquet(PATH.joinpath("articles.parquet")) 16 | 17 | # ==== LOOKUP DICTS 18 | clicked_dict = create_lookup_dict( 19 | df_articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOTAL_PAGEVIEWS_COL).collect(), 20 | DEFAULT_ARTICLE_ID_COL, 21 | DEFAULT_TOTAL_PAGEVIEWS_COL, 22 | ) 23 | inview_dict = create_lookup_dict( 24 | df_articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOTAL_INVIEWS_COL).collect(), 25 | DEFAULT_ARTICLE_ID_COL, 26 | DEFAULT_TOTAL_INVIEWS_COL, 27 | ) 28 | readtime_dict = create_lookup_dict( 29 | df_articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOTAL_READ_TIME_COL).collect(), 30 | DEFAULT_ARTICLE_ID_COL, 31 | DEFAULT_TOTAL_READ_TIME_COL, 32 | ) 33 | 34 | # Estimate inview counts directly from the test impressions: 35 | df_inview_estimate = ( 36 | df_behaviors.select(DEFAULT_INVIEW_ARTICLES_COL) 37 | .explode(DEFAULT_INVIEW_ARTICLES_COL) 38 | .select(pl.col(DEFAULT_INVIEW_ARTICLES_COL).value_counts()) 39 | .unnest(DEFAULT_INVIEW_ARTICLES_COL) 40 | .collect() 41 | ) 42 | inview_dict_estimate = create_lookup_dict( 43 | df_inview_estimate.select(DEFAULT_INVIEW_ARTICLES_COL, "count"), 44 | DEFAULT_INVIEW_ARTICLES_COL, 45 | "count", 46 | ) 47 | 48 | # ==== CLICKED PREDICTIONS 49 | CLICKED_SCORE_COL = "clicked_prediction_scores" 50 | INVIEW_SCORE_COL = "inview_prediction_scores" 51 | INVIEW_ESTIMATE_SCORE_COL = "inview_estimate_prediction_scores" 52 | READTIME_SCORE_COL = "readtime_prediction_scores" 53 | 54 | df_predictions
= ( 55 | df_behaviors.select(DEFAULT_IMPRESSION_ID_COL, DEFAULT_INVIEW_ARTICLES_COL) 56 | .with_columns( 57 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 58 | .list.eval(pl.element().replace(clicked_dict).fill_null(0)) 59 | .alias(CLICKED_SCORE_COL) 60 | ) 61 | .with_columns( 62 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 63 | .list.eval(pl.element().replace(inview_dict).fill_null(0)) 64 | .alias(INVIEW_SCORE_COL) 65 | ) 66 | .with_columns( 67 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 68 | .list.eval(pl.element().replace(inview_dict_estimate).fill_null(0)) 69 | .alias(INVIEW_ESTIMATE_SCORE_COL) 70 | ) 71 | .with_columns( 72 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 73 | .list.eval(pl.element().replace(readtime_dict).fill_null(0)) 74 | .alias(READTIME_SCORE_COL) 75 | ) 76 | .collect() 77 | ) 78 | 79 | # CONVERT TO RANKS: 80 | impression_id = [] 81 | clicked_scores = [] 82 | inview_scores = [] 83 | inview_estimate_scores = [] 84 | readtime_scores = [] 85 | for row in tqdm( 86 | df_predictions.iter_rows(named=True), 87 | total=df_predictions.shape[0], 88 | ncols=80, 89 | ): 90 | impression_id.append(row[DEFAULT_IMPRESSION_ID_COL]) 91 | clicked_scores.append(rank_predictions_by_score(row[CLICKED_SCORE_COL])) 92 | inview_scores.append(rank_predictions_by_score(row[INVIEW_SCORE_COL])) 93 | inview_estimate_scores.append( 94 | rank_predictions_by_score(row[INVIEW_ESTIMATE_SCORE_COL]) 95 | ) 96 | readtime_scores.append(rank_predictions_by_score(row[READTIME_SCORE_COL])) 97 | 98 | # 99 | for col, scores in zip( 100 | [ 101 | CLICKED_SCORE_COL, 102 | INVIEW_SCORE_COL, 103 | INVIEW_ESTIMATE_SCORE_COL, 104 | READTIME_SCORE_COL, 105 | ], 106 | [clicked_scores, inview_scores, inview_estimate_scores, readtime_scores], 107 | ): 108 | print("Writing submission file for:", col) 109 | Path("downloads").mkdir(exist_ok=True) 110 | write_submission_file( 111 | impression_ids=impression_id, 112 | prediction_scores=scores, 113 | path="downloads/predictions.txt", 114 | filename_zip=f"{col}.zip", 115 | ) 116 | -------------------------------------------------------------------------------- /examples/datasets/plot/article_read_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/article_read_time.png -------------------------------------------------------------------------------- /examples/datasets/plot/body_len.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/body_len.png -------------------------------------------------------------------------------- /examples/datasets/plot/category_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/category_distribution.png -------------------------------------------------------------------------------- /examples/datasets/plot/category_distribution_ba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/category_distribution_ba.png -------------------------------------------------------------------------------- /examples/datasets/plot/front_article_page.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/front_article_page.png -------------------------------------------------------------------------------- /examples/datasets/plot/front_read_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/front_read_time.png -------------------------------------------------------------------------------- /examples/datasets/plot/inview_len.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/inview_len.png -------------------------------------------------------------------------------- /examples/datasets/plot/subtitle_len.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/subtitle_len.png -------------------------------------------------------------------------------- /examples/datasets/plot/title_len.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/title_len.png -------------------------------------------------------------------------------- /examples/quick_start/lstur_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.model_config import hparams_lstur 3 | from ebrec.models.newsrec.lstur import LSTURModel 4 | import numpy as np 5 | 6 | config = hparams_lstur 7 | 8 | # Define the number of samples in your batch 9 | BATCH_SIZE = 300 10 | HISTORY_SIZE = config.history_size 11 | TITLE_SIZE = config.title_size 12 | NPRATIO = 4 13 | word_embeddings = np.random.rand(1000, 100) 14 | 15 | # Define the shapes of the input data 16 | his_input_title_shape = (HISTORY_SIZE, TITLE_SIZE) 17 | pred_input_title_shape = (NPRATIO + 1, TITLE_SIZE) 18 | vocab_size = word_embeddings.shape[0] 19 | n_users = config.n_users 20 | label_shape = (NPRATIO + 1,) 21 | user_indexes_shape = (1,) 22 | 23 | model = LSTURModel(hparams=config, word2vec_embedding=word_embeddings) 24 | model.model.summary() 25 | 26 | # Generate some random input data for input_1 with values between 0 and 1 27 | his_input_title = np.random.randint(0, vocab_size, (BATCH_SIZE, *his_input_title_shape)) 28 | # Generate some random input data for input_2 with values between 0 and 1 29 | pred_input_title = np.random.randint( 30 | 0, vocab_size, (BATCH_SIZE, *pred_input_title_shape) 31 | ) 32 | # Input data for user_indexes 33 | user_indexes = np.random.randint(0, n_users, size=(BATCH_SIZE, *user_indexes_shape)) 34 | 35 | # Generate some random label data with values between 0 and 1 36 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 37 | for row in label_data: 38 | row[np.random.choice(label_shape[0])] = 1 39 | 40 | # Print the shapes of the input data to verify they match the model's input layers 41 | print(his_input_title.shape) 42 | print(pred_input_title.shape) 43 | print(user_indexes.shape) 44 | 
print(label_data.shape) 45 | 46 | # Make input for model: 47 | input = (user_indexes, his_input_title, pred_input_title) 48 | 49 | # fit/predict: 50 | model.model.fit(input, label_data) 51 | model.model.predict(input) 52 | -------------------------------------------------------------------------------- /examples/quick_start/naml_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.model_config import hparams_naml 3 | from ebrec.models.newsrec.naml import NAMLModel 4 | import numpy as np 5 | 6 | config = hparams_naml 7 | 8 | # Define the number of samples in your batch 9 | BATCH_SIZE = 300 10 | NPRATIO = 4 11 | HISTORY_SIZE = config.history_size 12 | TITLE_SIZE = config.title_size 13 | BODY_SIZE = config.body_size 14 | 15 | label_shape = (NPRATIO + 1,) 16 | word_embeddings = np.random.rand(1000, 100) 17 | 18 | vocab_size = word_embeddings.shape[0] 19 | n_verts = config.vert_num 20 | n_subverts = config.subvert_num 21 | 22 | # Model 23 | model = NAMLModel(hparams=config, word2vec_embedding=word_embeddings) 24 | model.model.summary() 25 | 26 | # Define the shapes of the input data 27 | his_input_title = np.random.randint( 28 | 0, vocab_size, size=(BATCH_SIZE, HISTORY_SIZE, TITLE_SIZE) 29 | ) 30 | his_input_body = np.random.randint( 31 | 0, vocab_size, size=(BATCH_SIZE, HISTORY_SIZE, BODY_SIZE) 32 | ) 33 | his_input_vert = np.random.randint(0, n_verts, size=(BATCH_SIZE, HISTORY_SIZE, 1)) 34 | his_input_subvert = np.random.randint(0, n_subverts, size=(BATCH_SIZE, HISTORY_SIZE, 1)) 35 | pred_input_title = np.random.randint( 36 | 0, vocab_size, size=(BATCH_SIZE, NPRATIO + 1, TITLE_SIZE) 37 | ) 38 | pred_input_body = np.random.randint( 39 | 0, vocab_size, size=(BATCH_SIZE, NPRATIO + 1, BODY_SIZE) 40 | ) 41 | pred_input_vert = np.random.randint(0, n_verts, size=(BATCH_SIZE, NPRATIO + 1, 1)) 42 | pred_input_subvert = np.random.randint(0, n_subverts, size=(BATCH_SIZE, NPRATIO + 1, 1)) 43 | 44 | # Generate some random label data with values between 0 and 1 45 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 46 | for row in label_data: 47 | row[np.random.choice(label_shape[0])] = 1 48 | 49 | # 50 | his_input_title.shape 51 | his_input_body.shape 52 | his_input_vert.shape 53 | his_input_subvert.shape 54 | pred_input_title.shape 55 | pred_input_body.shape 56 | pred_input_vert.shape 57 | pred_input_subvert.shape 58 | label_data.shape 59 | 60 | # Make input for model: 61 | input = ( 62 | his_input_title, 63 | his_input_body, 64 | his_input_vert, 65 | his_input_subvert, 66 | pred_input_title, 67 | pred_input_body, 68 | pred_input_vert, 69 | pred_input_subvert, 70 | ) 71 | 72 | # fit/predict: 73 | model.model.fit(input, label_data) 74 | model.model.predict(input) 75 | -------------------------------------------------------------------------------- /examples/quick_start/npa_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.model_config import hparams_npa 3 | from ebrec.models.newsrec.npa import NPAModel 4 | import numpy as np 5 | 6 | config = hparams_npa 7 | 8 | # Define the number of samples in your batch 9 | BATCH_SIZE = 300 10 | HISTORY_SIZE = config.history_size 11 | TITLE_SIZE = config.title_size 12 | NPRATIO = 4 13 | word_embeddings = np.random.rand(1000, 100) 14 | 15 | # Define the shapes of the input data 16 | his_input_title_shape = (HISTORY_SIZE, TITLE_SIZE) 17 | 
pred_input_title_shape = (NPRATIO + 1, TITLE_SIZE) 18 | vocab_size = word_embeddings.shape[0] 19 | n_users = config.n_users 20 | label_shape = (NPRATIO + 1,) 21 | user_indexes_shape = (1,) 22 | 23 | model = NPAModel(hparams=config) 24 | model.model.summary() 25 | 26 | # Generate some random input data for input_1 with values between 0 and 1 27 | his_input_title = np.random.randint(0, vocab_size, (BATCH_SIZE, *his_input_title_shape)) 28 | # Generate some random input data for input_2 with values between 0 and 1 29 | pred_input_title = np.random.randint( 30 | 0, vocab_size, (BATCH_SIZE, *pred_input_title_shape) 31 | ) 32 | # Input data for user_indexes 33 | user_indexes = np.random.randint(0, n_users, size=(BATCH_SIZE, *user_indexes_shape)) 34 | 35 | # Generate some random label data with values between 0 and 1 36 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 37 | for row in label_data: 38 | row[np.random.choice(label_shape[0])] = 1 39 | 40 | # Print the shapes of the input data to verify they match the model's input layers 41 | print(his_input_title.shape) 42 | print(pred_input_title.shape) 43 | print(user_indexes.shape) 44 | print(label_data.shape) 45 | 46 | # Make input for model: 47 | input = (user_indexes, his_input_title, pred_input_title) 48 | 49 | # fit/predict: 50 | model.model.fit(input, label_data) 51 | model.model.predict(input) 52 | -------------------------------------------------------------------------------- /examples/quick_start/nrms_docvec_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.nrms_docvec import NRMSDocVec 3 | from ebrec.models.newsrec.model_config import hparams_nrms 4 | import numpy as np 5 | 6 | DOCVEC_DIM = 300 7 | BATCH_SIZE = 10 8 | HISTORY_SIZE = 20 9 | NPRATIO = 4 10 | 11 | # 12 | config = hparams_nrms 13 | config.history_size = HISTORY_SIZE 14 | config.title_size = DOCVEC_DIM 15 | 16 | # MODEL: 17 | model = NRMSDocVec(hparams=config, newsencoder_units_per_layer=[512, 512]) 18 | model.model.summary() 19 | 20 | # 21 | his_input_title_shape = (HISTORY_SIZE, DOCVEC_DIM) 22 | pred_input_title_shape = (NPRATIO + 1, DOCVEC_DIM) 23 | label_shape = (NPRATIO + 1,) 24 | 25 | # Generate some random input data for input_1 26 | his_input_title = np.array( 27 | [np.random.rand(*his_input_title_shape) for _ in range(BATCH_SIZE)] 28 | ) 29 | # Generate some random input data for input_2 30 | pred_input_title = np.array( 31 | [np.random.rand(*pred_input_title_shape) for _ in range(BATCH_SIZE)] 32 | ) 33 | # Generate some random label data with values between 0 and 1 34 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 35 | for row in label_data: 36 | row[np.random.choice(label_shape[0])] = 1 37 | 38 | # Print the shapes of the input data to verify they match the model's input layers 39 | print(his_input_title.shape) 40 | print(pred_input_title.shape) 41 | print(label_data.shape) 42 | 43 | # Make input for model: 44 | input = (his_input_title, pred_input_title) 45 | 46 | # fit/predict: 47 | model.model.fit(input, label_data, epochs=10) 48 | model.model.predict(input) 49 | -------------------------------------------------------------------------------- /examples/quick_start/nrms_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.model_config import hparams_nrms 3 | from ebrec.models.newsrec.nrms import NRMSModel 4 | import numpy as np 
5 | 6 | config = hparams_nrms 7 | 8 | # Define the number of samples in your batch 9 | BATCH_SIZE = 10 10 | HISTORY_SIZE = config.history_size 11 | TITLE_SIZE = config.title_size 12 | NPRATIO = 4 13 | word_embeddings = np.random.rand(1000, 100) 14 | 15 | model = NRMSModel(hparams=config, word2vec_embedding=word_embeddings) 16 | model.model.summary() 17 | 18 | # Define the shapes of the input data 19 | his_input_title_shape = (HISTORY_SIZE, TITLE_SIZE) 20 | pred_input_title_shape = (NPRATIO + 1, TITLE_SIZE) 21 | label_shape = (NPRATIO + 1,) 22 | vocab_size = word_embeddings.shape[0] 23 | 24 | # Generate some random input data for input_1 with values between 0 and 1 25 | his_input_title = np.random.randint(0, vocab_size, (BATCH_SIZE, *his_input_title_shape)) 26 | 27 | # Generate some random input data for input_2 with values between 0 and 1 28 | pred_input_title = np.random.randint( 29 | 0, vocab_size, (BATCH_SIZE, *pred_input_title_shape) 30 | ) 31 | 32 | # Generate some random label data with values between 0 and 1 33 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 34 | for row in label_data: 35 | row[np.random.choice(label_shape[0])] = 1 36 | 37 | # Print the shapes of the input data to verify they match the model's input layers 38 | print(his_input_title.shape) 39 | print(pred_input_title.shape) 40 | print(label_data.shape) 41 | 42 | # Make input for model: 43 | input = (his_input_title, pred_input_title) 44 | 45 | # fit/predict: 46 | model.model.fit(input, label_data) 47 | model.model.predict(input) 48 | -------------------------------------------------------------------------------- /examples/quick_start/nrms_ebnerd.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.backend import clear_session 2 | from transformers import AutoTokenizer, AutoModel 3 | from pathlib import Path 4 | import tensorflow as tf 5 | import datetime as dt 6 | import polars as pl 7 | import numpy as np 8 | import gc 9 | import os 10 | 11 | from ebrec.utils._constants import ( 12 | DEFAULT_HISTORY_ARTICLE_ID_COL, 13 | DEFAULT_IS_BEYOND_ACCURACY_COL, 14 | DEFAULT_CLICKED_ARTICLES_COL, 15 | DEFAULT_INVIEW_ARTICLES_COL, 16 | DEFAULT_IMPRESSION_ID_COL, 17 | DEFAULT_SUBTITLE_COL, 18 | DEFAULT_LABELS_COL, 19 | DEFAULT_TITLE_COL, 20 | DEFAULT_USER_COL, 21 | ) 22 | 23 | from ebrec.utils._behaviors import ( 24 | create_binary_labels_column, 25 | sampling_strategy_wu2019, 26 | add_known_user_column, 27 | add_prediction_scores, 28 | truncate_history, 29 | ) 30 | from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore 31 | from ebrec.utils._articles import convert_text2encoding_with_transformers 32 | from ebrec.utils._polars import ( 33 | slice_join_dataframes, 34 | concat_str_columns, 35 | chunk_dataframe, 36 | split_df, 37 | ) 38 | from ebrec.utils._articles import create_article_id_to_value_mapping 39 | from ebrec.utils._nlp import get_transformers_word_embeddings 40 | from ebrec.utils._python import write_submission_file, rank_predictions_by_score 41 | 42 | from ebrec.models.newsrec.dataloader import NRMSDataLoader, NRMSDataLoaderPretransform 43 | from ebrec.models.newsrec.model_config import hparams_nrms 44 | from ebrec.models.newsrec import NRMSModel 45 | 46 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 47 | gpus = tf.config.experimental.list_physical_devices("GPU") 48 | for gpu in gpus: 49 | tf.config.experimental.set_memory_growth(gpu, True) 50 | 51 | # conda activate ./venv/ 52 | # python -i 
examples/00_quick_start/nrms_ebnerd.py 53 | 54 | 55 | def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame: 56 | """ 57 | Load ebnerd - function 58 | """ 59 | df_history = ( 60 | pl.scan_parquet(path.joinpath("history.parquet")) 61 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL) 62 | .pipe( 63 | truncate_history, 64 | column=DEFAULT_HISTORY_ARTICLE_ID_COL, 65 | history_size=history_size, 66 | padding_value=0, 67 | enable_warning=False, 68 | ) 69 | ) 70 | df_behaviors = ( 71 | pl.scan_parquet(path.joinpath("behaviors.parquet")) 72 | .collect() 73 | .pipe( 74 | slice_join_dataframes, 75 | df2=df_history.collect(), 76 | on=DEFAULT_USER_COL, 77 | how="left", 78 | ) 79 | ) 80 | return df_behaviors 81 | 82 | 83 | PATH = Path("~/ebnerd_data").expanduser() 84 | DUMP_DIR = Path("ebnerd_predictions") 85 | DUMP_DIR.mkdir(exist_ok=True, parents=True) 86 | SEED = np.random.randint(0, 1_000) 87 | 88 | MODEL_NAME = f"NRMS-{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}-{SEED}" 89 | # MODEL_NAME = "NRMS-382861963-2024-11-12 01:34:49.050070" 90 | 91 | MODEL_WEIGHTS = DUMP_DIR.joinpath(f"state_dict/{MODEL_NAME}/weights") 92 | LOG_DIR = DUMP_DIR.joinpath(f"runs/{MODEL_NAME}") 93 | TEST_DF_DUMP = DUMP_DIR.joinpath("test_predictions", MODEL_NAME) 94 | TEST_DF_DUMP.mkdir(parents=True, exist_ok=True) 95 | 96 | print(f"Dir: {MODEL_NAME}") 97 | 98 | DATASPLIT = "ebnerd_small" 99 | MAX_TITLE_LENGTH = 30 100 | HISTORY_SIZE = 20 101 | FRACTION = 1.0 102 | EPOCHS = 5 103 | FRACTION_TEST = 1.0 104 | # 105 | hparams_nrms.history_size = HISTORY_SIZE 106 | 107 | BATCH_SIZE_TRAIN = 32 108 | BATCH_SIZE_VAL = 32 109 | BATCH_SIZE_TEST_WO_B = 32 110 | BATCH_SIZE_TEST_W_B = 4 111 | N_CHUNKS_TEST = 10 112 | CHUNKS_DONE = 0 113 | 114 | COLUMNS = [ 115 | DEFAULT_USER_COL, 116 | DEFAULT_HISTORY_ARTICLE_ID_COL, 117 | DEFAULT_INVIEW_ARTICLES_COL, 118 | DEFAULT_CLICKED_ARTICLES_COL, 119 | DEFAULT_IMPRESSION_ID_COL, 120 | ] 121 | 122 | df_train = ( 123 | ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE) 124 | .sample(fraction=FRACTION) 125 | .select(COLUMNS) 126 | .pipe( 127 | sampling_strategy_wu2019, 128 | npratio=4, 129 | shuffle=True, 130 | with_replacement=True, 131 | seed=SEED, 132 | ) 133 | .pipe(create_binary_labels_column) 134 | ) 135 | df_train, df_validation = split_df(df_train, fraction=0.9, seed=SEED, shuffle=False) 136 | 137 | # df_test = df_validation 138 | # df_train = df_train[:100] 139 | # df_validation = df_validation[:100] 140 | # df_test = df_test[:100] 141 | df_articles = pl.read_parquet(PATH.joinpath("articles.parquet")) 142 | 143 | # => 144 | TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base" 145 | TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL] 146 | 147 | # LOAD HUGGINGFACE: 148 | transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME) 149 | transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME) 150 | 151 | word2vec_embedding = get_transformers_word_embeddings(transformer_model) 152 | # 153 | df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE) 154 | df_articles, token_col_title = convert_text2encoding_with_transformers( 155 | df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH 156 | ) 157 | # => 158 | article_mapping = create_article_id_to_value_mapping( 159 | df=df_articles, value_col=token_col_title 160 | ) 161 | 162 | # => 163 | print("Init train- and val-dataloader") 164 | train_dataloader = NRMSDataLoaderPretransform( 165 | 
behaviors=df_train, 166 | article_dict=article_mapping, 167 | unknown_representation="zeros", 168 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 169 | eval_mode=False, 170 | batch_size=BATCH_SIZE_TRAIN, 171 | ) 172 | val_dataloader = NRMSDataLoaderPretransform( 173 | behaviors=df_validation, 174 | article_dict=article_mapping, 175 | unknown_representation="zeros", 176 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 177 | eval_mode=False, 178 | batch_size=BATCH_SIZE_VAL, 179 | ) 180 | 181 | # CALLBACKS 182 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1) 183 | early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2) 184 | modelcheckpoint = tf.keras.callbacks.ModelCheckpoint( 185 | filepath=MODEL_WEIGHTS, save_best_only=True, save_weights_only=True, verbose=1 186 | ) 187 | 188 | model = NRMSModel( 189 | hparams=hparams_nrms, 190 | word2vec_embedding=word2vec_embedding, 191 | seed=42, 192 | ) 193 | hist = model.model.fit( 194 | train_dataloader, 195 | validation_data=val_dataloader, 196 | epochs=EPOCHS, 197 | callbacks=[tensorboard_callback, early_stopping], 198 | ) 199 | del ( 200 | transformer_tokenizer, 201 | transformer_model, 202 | train_dataloader, 203 | val_dataloader, 204 | df_validation, 205 | df_train, 206 | ) 207 | gc.collect() 208 | 209 | print(f"saving model: {MODEL_WEIGHTS}") 210 | model.model.save_weights(MODEL_WEIGHTS) 211 | print(f"loading model: {MODEL_WEIGHTS}") 212 | model.model.load_weights(MODEL_WEIGHTS) 213 | 214 | # => 215 | print("Init df_test") 216 | df_test = ( 217 | ebnerd_from_path(PATH.joinpath("ebnerd_testset", "test"), history_size=HISTORY_SIZE) 218 | .sample(fraction=FRACTION_TEST) 219 | .with_columns( 220 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 221 | .list.first() 222 | .alias(DEFAULT_CLICKED_ARTICLES_COL) 223 | ) 224 | .select(COLUMNS + [DEFAULT_IS_BEYOND_ACCURACY_COL]) 225 | .with_columns( 226 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 227 | .list.eval(pl.element() * 0) 228 | .alias(DEFAULT_LABELS_COL) 229 | ) 230 | ) 231 | # Split test in beyond-accuracy. BA samples have more 'article_ids_inview'. 
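# Beyond-accuracy (BA) impressions carry far more in-view candidates than the ordinary
# test impressions, which is why they get their own dataloader further below and the much
# smaller BATCH_SIZE_TEST_W_B, keeping the evaluation memory footprint manageable.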
232 | df_test_wo_beyond = df_test.filter(~pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL)) 233 | df_test_w_beyond = df_test.filter(pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL)) 234 | 235 | df_test_chunks = chunk_dataframe(df_test_wo_beyond, n_chunks=N_CHUNKS_TEST) 236 | df_pred_test_wo_beyond = [] 237 | 238 | for i, df_test_chunk in enumerate(df_test_chunks[CHUNKS_DONE:], start=1 + CHUNKS_DONE): 239 | print(f"Init test-dataloader: {i}/{len(df_test_chunks)}") 240 | # Initialize DataLoader 241 | test_dataloader_wo_b = NRMSDataLoader( 242 | behaviors=df_test_chunk, 243 | article_dict=article_mapping, 244 | unknown_representation="zeros", 245 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 246 | eval_mode=True, 247 | batch_size=BATCH_SIZE_TEST_WO_B, 248 | ) 249 | # Predict and clear session 250 | scores = model.scorer.predict(test_dataloader_wo_b) 251 | clear_session() 252 | 253 | # Process the predictions 254 | df_test_chunk = add_prediction_scores(df_test_chunk, scores.tolist()).with_columns( 255 | pl.col("scores") 256 | .map_elements(lambda x: list(rank_predictions_by_score(x))) 257 | .alias("ranked_scores") 258 | ) 259 | 260 | # Save the processed chunk 261 | df_test_chunk.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet( 262 | TEST_DF_DUMP.joinpath(f"pred_wo_ba_{i}.parquet") 263 | ) 264 | 265 | # Append and clean up 266 | df_pred_test_wo_beyond.append(df_test_chunk) 267 | 268 | # Cleanup 269 | del df_test_chunk, test_dataloader_wo_b, scores 270 | gc.collect() 271 | 272 | # => 273 | df_pred_test_wo_beyond = pl.concat(df_pred_test_wo_beyond) 274 | df_pred_test_wo_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet( 275 | TEST_DF_DUMP.joinpath("pred_wo_ba.parquet") 276 | ) 277 | 278 | print("Init test-dataloader: beyond-accuracy") 279 | test_dataloader_w_b = NRMSDataLoader( 280 | behaviors=df_test_w_beyond, 281 | article_dict=article_mapping, 282 | unknown_representation="zeros", 283 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 284 | eval_mode=True, 285 | batch_size=BATCH_SIZE_TEST_W_B, 286 | ) 287 | scores = model.scorer.predict(test_dataloader_w_b) 288 | df_pred_test_w_beyond = add_prediction_scores( 289 | df_test_w_beyond, scores.tolist() 290 | ).with_columns( 291 | pl.col("scores") 292 | .map_elements(lambda x: list(rank_predictions_by_score(x))) 293 | .alias("ranked_scores") 294 | ) 295 | df_pred_test_w_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet( 296 | TEST_DF_DUMP.joinpath("pred_w_ba.parquet") 297 | ) 298 | 299 | # => 300 | df_test = pl.concat([df_pred_test_wo_beyond, df_pred_test_w_beyond]) 301 | df_test.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet( 302 | TEST_DF_DUMP.joinpath("pred_concat.parquet") 303 | ) 304 | # metrics = MetricEvaluator( 305 | # labels=df_validation["labels"].to_list(), 306 | # predictions=df_validation["scores"].to_list(), 307 | # metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)], 308 | # ) 309 | # metrics.evaluate() 310 | 311 | write_submission_file( 312 | impression_ids=df_test[DEFAULT_IMPRESSION_ID_COL], 313 | prediction_scores=df_test["ranked_scores"], 314 | path=DUMP_DIR.joinpath("predictions.txt"), 315 | filename_zip=f"{DATASPLIT}_predictions-{MODEL_NAME}.zip", 316 | ) 317 | -------------------------------------------------------------------------------- /examples/reproducibility_scripts/args_nrms.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | parser = 
argparse.ArgumentParser( 6 | description="Argument parser for NRMSModel training" 7 | ) 8 | 9 | parser.add_argument( 10 | "--data_path", 11 | type=str, 12 | default=str("~/ebnerd_data"), 13 | help="Path to the data directory", 14 | ) 15 | 16 | # General settings 17 | parser.add_argument("--seed", type=int, default=123, help="Random seed") 18 | parser.add_argument( 19 | "--datasplit", type=str, default="ebnerd_small", help="Dataset split to use" 20 | ) 21 | parser.add_argument("--debug", action="store_true", help="Enable debug mode") 22 | 23 | # Batch sizes 24 | parser.add_argument( 25 | "--bs_train", type=int, default=32, help="Batch size for training" 26 | ) 27 | parser.add_argument( 28 | "--bs_test", type=int, default=32, help="Batch size for testing" 29 | ) 30 | parser.add_argument( 31 | "--batch_size_test_wo_b", 32 | type=int, 33 | default=32, 34 | help="Batch size for testing without balancing", 35 | ) 36 | parser.add_argument( 37 | "--batch_size_test_w_b", 38 | type=int, 39 | default=4, 40 | help="Batch size for testing with balancing", 41 | ) 42 | 43 | # History and ratios 44 | parser.add_argument( 45 | "--history_size", type=int, default=20, help="History size for the model" 46 | ) 47 | parser.add_argument( 48 | "--npratio", type=int, default=4, help="Negative-positive ratio" 49 | ) 50 | 51 | # Training settings 52 | parser.add_argument("--epochs", type=int, default=5, help="Number of epochs") 53 | parser.add_argument( 54 | "--train_fraction", 55 | type=float, 56 | default=1.0, 57 | help="Fraction of training data to use", 58 | ) 59 | parser.add_argument( 60 | "--fraction_test", 61 | type=float, 62 | default=1.0, 63 | help="Fraction of testing data to use", 64 | ) 65 | 66 | # Model and loader settings 67 | parser.add_argument( 68 | "--nrms_loader", 69 | type=str, 70 | default="NRMSDataLoaderPretransform", 71 | choices=["NRMSDataLoaderPretransform", "NRMSDataLoader"], 72 | help="Data loader type (speed or memory efficient)", 73 | ) 74 | 75 | # Chunk processing 76 | parser.add_argument( 77 | "--n_chunks_test", type=int, default=10, help="Number of test chunks to process" 78 | ) 79 | parser.add_argument( 80 | "--chunks_done", type=int, default=0, help="Number of chunks already processed" 81 | ) 82 | 83 | # ===================================================================================== 84 | # ############################# UNIQUE FOR NRMSDocVec ############################### 85 | # ===================================================================================== 86 | # Transformer settings 87 | parser.add_argument( 88 | "--transformer_model_name", 89 | type=str, 90 | default="FacebookAI/xlm-roberta-large", 91 | help="Transformer model name", 92 | ) 93 | parser.add_argument( 94 | "--max_title_length", 95 | type=int, 96 | default=30, 97 | help="Maximum length of title encoding", 98 | ) 99 | 100 | # Hyperparameters 101 | parser.add_argument( 102 | "--head_num", type=int, default=20, help="Number of attention heads" 103 | ) 104 | parser.add_argument( 105 | "--head_dim", type=int, default=20, help="Dimension of each attention head" 106 | ) 107 | parser.add_argument( 108 | "--attention_hidden_dim", 109 | type=int, 110 | default=200, 111 | help="Dimension of attention hidden layers", 112 | ) 113 | 114 | # Optimizer settings 115 | parser.add_argument( 116 | "--optimizer", type=str, default="adam", help="Optimizer to use" 117 | ) 118 | parser.add_argument( 119 | "--loss", type=str, default="cross_entropy_loss", help="Loss function" 120 | ) 121 | parser.add_argument("--dropout", 
type=float, default=0.20, help="Dropout rate") 122 | parser.add_argument( 123 | "--learning_rate", type=float, default=1e-4, help="Learning rate" 124 | ) 125 | 126 | return parser.parse_args() 127 | -------------------------------------------------------------------------------- /examples/reproducibility_scripts/args_nrms_docvec.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | parser = argparse.ArgumentParser( 6 | description="Argument parser for NRMSModel training" 7 | ) 8 | 9 | parser.add_argument( 10 | "--data_path", 11 | type=str, 12 | default=str("~/ebnerd_data"), 13 | help="Path to the data directory", 14 | ) 15 | 16 | # General settings 17 | parser.add_argument("--seed", type=int, default=123, help="Random seed") 18 | parser.add_argument( 19 | "--datasplit", type=str, default="ebnerd_small", help="Dataset split to use" 20 | ) 21 | parser.add_argument("--debug", action="store_true", help="Enable debug mode") 22 | 23 | # Batch sizes 24 | parser.add_argument( 25 | "--bs_train", type=int, default=32, help="Batch size for training" 26 | ) 27 | parser.add_argument( 28 | "--bs_test", type=int, default=32, help="Batch size for testing" 29 | ) 30 | parser.add_argument( 31 | "--batch_size_test_wo_b", 32 | type=int, 33 | default=32, 34 | help="Batch size for testing without balancing", 35 | ) 36 | parser.add_argument( 37 | "--batch_size_test_w_b", 38 | type=int, 39 | default=4, 40 | help="Batch size for testing with balancing", 41 | ) 42 | 43 | # History and ratios 44 | parser.add_argument( 45 | "--history_size", type=int, default=20, help="History size for the model" 46 | ) 47 | parser.add_argument( 48 | "--npratio", type=int, default=4, help="Negative-positive ratio" 49 | ) 50 | 51 | # Training settings 52 | parser.add_argument("--epochs", type=int, default=5, help="Number of epochs") 53 | parser.add_argument( 54 | "--train_fraction", 55 | type=float, 56 | default=1.0, 57 | help="Fraction of training data to use", 58 | ) 59 | parser.add_argument( 60 | "--fraction_test", 61 | type=float, 62 | default=1.0, 63 | help="Fraction of testing data to use", 64 | ) 65 | 66 | # Model and loader settings 67 | parser.add_argument( 68 | "--nrms_loader", 69 | type=str, 70 | default="NRMSDataLoaderPretransform", 71 | choices=["NRMSDataLoaderPretransform", "NRMSDataLoader"], 72 | help="Data loader type (speed or memory efficient)", 73 | ) 74 | 75 | # Chunk processing 76 | parser.add_argument( 77 | "--n_chunks_test", type=int, default=10, help="Number of test chunks to process" 78 | ) 79 | parser.add_argument( 80 | "--chunks_done", type=int, default=0, help="Number of chunks already processed" 81 | ) 82 | 83 | # ===================================================================================== 84 | # ############################# UNIQUE FOR NRMSDocVec ############################### 85 | # ===================================================================================== 86 | 87 | parser.add_argument( 88 | "--document_embeddings", 89 | type=str, 90 | default="Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet", 91 | help="Path to the document embeddings file", 92 | ) 93 | # Model function and architecture 94 | parser.add_argument( 95 | "--title_size", type=int, default=768, help="Size of title encoding" 96 | ) 97 | parser.add_argument( 98 | "--head_num", type=int, default=16, help="Number of attention heads" 99 | ) 100 | parser.add_argument( 101 | "--head_dim", type=int, default=16, help="Dimension of each 
attention head" 102 | ) 103 | parser.add_argument( 104 | "--attention_hidden_dim", 105 | type=int, 106 | default=200, 107 | help="Dimension of attention hidden layers", 108 | ) 109 | parser.add_argument( 110 | "--newsencoder_units_per_layer", 111 | nargs="+", 112 | type=int, 113 | default=[512, 512, 512], 114 | help="List of units per layer in the news encoder", 115 | ) 116 | 117 | # Optimizer settings 118 | parser.add_argument( 119 | "--optimizer", type=str, default="adam", help="Optimizer to use" 120 | ) 121 | parser.add_argument( 122 | "--loss", type=str, default="cross_entropy_loss", help="Loss function" 123 | ) 124 | parser.add_argument("--dropout", type=float, default=0.2, help="Dropout rate") 125 | parser.add_argument( 126 | "--learning_rate", type=float, default=1e-4, help="Learning rate" 127 | ) 128 | parser.add_argument( 129 | "--newsencoder_l2_regularization", 130 | type=float, 131 | default=1e-4, 132 | help="L2 regularization for the news encoder", 133 | ) 134 | 135 | return parser.parse_args() 136 | -------------------------------------------------------------------------------- /examples/reproducibility_scripts/ebnerd_nrms_doc_hist.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModel 2 | from ebrec.utils._nlp import get_transformers_word_embeddings 3 | from ebrec.utils._articles import convert_text2encoding_with_transformers 4 | 5 | from pathlib import Path 6 | import tensorflow as tf 7 | import datetime as dt 8 | import polars as pl 9 | import shutil 10 | import gc 11 | import os 12 | 13 | from ebrec.utils._constants import * 14 | 15 | from ebrec.utils._behaviors import ( 16 | create_binary_labels_column, 17 | sampling_strategy_wu2019, 18 | add_prediction_scores, 19 | truncate_history, 20 | ebnerd_from_path, 21 | ) 22 | from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore 23 | 24 | from ebrec.utils._python import ( 25 | write_submission_file, 26 | rank_predictions_by_score, 27 | write_json_file, 28 | ) 29 | from ebrec.utils._articles import create_article_id_to_value_mapping 30 | from ebrec.utils._polars import split_df_chunks, concat_str_columns 31 | 32 | from ebrec.models.newsrec.dataloader import NRMSDataLoader, NRMSDataLoaderPretransform 33 | from ebrec.models.newsrec.model_config import ( 34 | hparams_nrms, 35 | hparams_nrms_docvec, 36 | hparams_to_dict, 37 | print_hparams, 38 | ) 39 | from ebrec.models.newsrec.nrms_docvec import NRMSDocVec 40 | from ebrec.models.newsrec import NRMSModel 41 | 42 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 43 | 44 | from args_nrms_docvec import get_args 45 | 46 | args = get_args() 47 | 48 | for arg, val in vars(args).items(): 49 | print(f"{arg} : {val}") 50 | 51 | PATH = Path(args.data_path).expanduser() 52 | # Access arguments as variables 53 | SEED = args.seed 54 | DATASPLIT = args.datasplit 55 | DEBUG = args.debug 56 | BS_TRAIN = args.bs_train 57 | BS_TEST = args.bs_test 58 | BATCH_SIZE_TEST_WO_B = args.batch_size_test_wo_b 59 | BATCH_SIZE_TEST_W_B = args.batch_size_test_w_b 60 | HISTORY_SIZE = args.history_size 61 | NPRATIO = args.npratio 62 | EPOCHS = args.epochs 63 | TRAIN_FRACTION = args.train_fraction if not DEBUG else 0.0001 64 | FRACTION_TEST = args.fraction_test if not DEBUG else 0.0001 65 | 66 | NRMSLoader_training = ( 67 | NRMSDataLoaderPretransform 68 | if args.nrms_loader == "NRMSDataLoaderPretransform" 69 | else NRMSDataLoader 70 | ) 71 | 72 | # 
===================================================================================== 73 | # ############################# UNIQUE FOR NRMSModel ################################ 74 | # ===================================================================================== 75 | 76 | # Model in use: 77 | model_func = NRMSDocVec 78 | hparams = hparams_nrms_docvec 79 | # 80 | hparams.title_size = args.title_size 81 | hparams.history_size = args.history_size 82 | hparams.head_num = args.head_num 83 | hparams.head_dim = args.head_dim 84 | hparams.attention_hidden_dim = args.attention_hidden_dim 85 | hparams.newsencoder_units_per_layer = args.newsencoder_units_per_layer 86 | hparams.optimizer = args.optimizer 87 | hparams.loss = args.loss 88 | hparams.dropout = args.dropout 89 | hparams.learning_rate = args.learning_rate 90 | hparams.newsencoder_l2_regularization = args.newsencoder_l2_regularization 91 | print_hparams(hparams) 92 | 93 | # ============= 94 | # Data-path 95 | DOC_VEC_PATH = PATH.joinpath(f"artifacts/{args.document_embeddings}") 96 | print("Initiating articles...") 97 | df_articles = pl.read_parquet(DOC_VEC_PATH) 98 | article_mapping = create_article_id_to_value_mapping( 99 | df=df_articles, value_col=df_articles.columns[-1] 100 | ) 101 | 102 | # ===================================================================================== 103 | # ############################# UNIQUE FOR NRMSDocVec ############################### 104 | # ===================================================================================== 105 | 106 | 107 | # Dump paths: 108 | DUMP_DIR = Path("ebnerd_predictions") 109 | DUMP_DIR.mkdir(exist_ok=True, parents=True) 110 | # 111 | DT_NOW = dt.datetime.now() 112 | # 113 | MODEL_NAME = model_func.__name__ 114 | MODEL_OUTPUT_NAME = f"{MODEL_NAME}-{DT_NOW}" 115 | # 116 | ARTIFACT_DIR = DUMP_DIR.joinpath("test_predictions", MODEL_NAME) 117 | # Model monitoring: 118 | MODEL_WEIGHTS = DUMP_DIR.joinpath(f"state_dict/{MODEL_OUTPUT_NAME}/weights") 119 | LOG_DIR = DUMP_DIR.joinpath(f"runs/{MODEL_OUTPUT_NAME}") 120 | # Evaluating the test test can be memory intensive, we'll chunk it up: 121 | TEST_CHUNKS_DIR = ARTIFACT_DIR.joinpath("test_chunks") 122 | TEST_CHUNKS_DIR.mkdir(parents=True, exist_ok=True) 123 | N_CHUNKS_TEST = 10 124 | CHUNKS_DONE = 0 # if it crashes, you can start from here. 
125 | # Just trying keeping the dataframe slime: 126 | COLUMNS = [ 127 | DEFAULT_IMPRESSION_TIMESTAMP_COL, 128 | DEFAULT_HISTORY_ARTICLE_ID_COL, 129 | DEFAULT_INVIEW_ARTICLES_COL, 130 | DEFAULT_CLICKED_ARTICLES_COL, 131 | DEFAULT_IMPRESSION_ID_COL, 132 | DEFAULT_USER_COL, 133 | ] 134 | # Store hparams 135 | write_json_file( 136 | hparams_to_dict(hparams), 137 | ARTIFACT_DIR.joinpath(f"{MODEL_NAME}_hparams.json"), 138 | ) 139 | write_json_file(vars(args), ARTIFACT_DIR.joinpath(f"{MODEL_NAME}_argparser.json")) 140 | 141 | # ===================================================================================== 142 | 143 | df = ( 144 | ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE) 145 | .sample(fraction=TRAIN_FRACTION, shuffle=True, seed=SEED) 146 | .select(COLUMNS) 147 | .pipe( 148 | sampling_strategy_wu2019, 149 | npratio=4, 150 | shuffle=True, 151 | with_replacement=True, 152 | seed=SEED, 153 | ) 154 | .pipe(create_binary_labels_column) 155 | ) 156 | # 157 | last_dt = df[DEFAULT_IMPRESSION_TIMESTAMP_COL].dt.date().max() - dt.timedelta(days=1) 158 | df_train = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).dt.date() < last_dt) 159 | df_validation = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).dt.date() >= last_dt) 160 | 161 | 162 | # ===================================================================================== 163 | train_dataloader = NRMSDataLoaderPretransform( 164 | behaviors=df_train, 165 | article_dict=article_mapping, 166 | unknown_representation="zeros", 167 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 168 | eval_mode=False, 169 | batch_size=BS_TRAIN, 170 | ) 171 | 172 | val_dataloader = NRMSDataLoaderPretransform( 173 | behaviors=df_validation, 174 | article_dict=article_mapping, 175 | unknown_representation="zeros", 176 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 177 | eval_mode=False, 178 | batch_size=BS_TEST, 179 | ) 180 | 181 | # ===================================================================================== 182 | print(f"Initiating training-dataloader") 183 | train_dataloader = NRMSLoader_training( 184 | behaviors=df_train, 185 | article_dict=article_mapping, 186 | unknown_representation="zeros", 187 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 188 | eval_mode=False, 189 | batch_size=BS_TRAIN, 190 | ) 191 | 192 | val_dataloader = NRMSLoader_training( 193 | behaviors=df_validation, 194 | article_dict=article_mapping, 195 | unknown_representation="zeros", 196 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 197 | eval_mode=False, 198 | batch_size=BS_TRAIN, 199 | ) 200 | 201 | # ===================================================================================== 202 | # CALLBACKS 203 | tensorboard_callback = tf.keras.callbacks.TensorBoard( 204 | log_dir=LOG_DIR, 205 | histogram_freq=1, 206 | ) 207 | early_stopping = tf.keras.callbacks.EarlyStopping( 208 | monitor="val_auc", 209 | mode="max", 210 | patience=4, 211 | restore_best_weights=True, 212 | ) 213 | modelcheckpoint = tf.keras.callbacks.ModelCheckpoint( 214 | filepath=MODEL_WEIGHTS, 215 | monitor="val_auc", 216 | mode="max", 217 | save_best_only=True, 218 | save_weights_only=True, 219 | verbose=1, 220 | ) 221 | lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau( 222 | monitor="val_auc", 223 | mode="max", 224 | factor=0.2, 225 | patience=2, 226 | min_lr=1e-6, 227 | ) 228 | callbacks = [tensorboard_callback, early_stopping, modelcheckpoint, lr_scheduler] 229 | 230 | # ===================================================================================== 231 | 
model = model_func( 232 | hparams=hparams, 233 | seed=42, 234 | ) 235 | model.model.compile( 236 | optimizer=model.model.optimizer, 237 | loss=model.model.loss, 238 | metrics=["AUC"], 239 | ) 240 | f"Initiating {MODEL_NAME}, start training..." 241 | # => 242 | hist = model.model.fit( 243 | train_dataloader, 244 | validation_data=val_dataloader, 245 | epochs=EPOCHS, 246 | callbacks=callbacks, 247 | ) 248 | 249 | print(f"loading model: {MODEL_WEIGHTS}") 250 | model.model.load_weights(MODEL_WEIGHTS) 251 | 252 | # ===================================================================================== 253 | 254 | # First filter: only keep users with >FILTER_MIN_HISTORY in history-size 255 | FILTER_MIN_HISTORY = 100 256 | # Truncate the history 257 | HIST_SIZE = 100 258 | 259 | # => 260 | df = ( 261 | ebnerd_from_path( 262 | PATH.joinpath(DATASPLIT, "validation"), history_size=120, padding=None 263 | ) 264 | .sample(fraction=FRACTION_TEST) 265 | .filter(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.len() >= FILTER_MIN_HISTORY) 266 | .select(COLUMNS) 267 | .pipe(create_binary_labels_column) 268 | ) 269 | 270 | pairs = [ 271 | (1, 256), 272 | (2, 256), 273 | (3, 256), 274 | (4, 256), 275 | (5, 256), 276 | (6, 256), 277 | (7, 256), 278 | (8, 256), 279 | (9, 256), 280 | (10, 256), 281 | (15, 128), 282 | (20, 128), 283 | (30, 64), 284 | (40, 64), 285 | (50, 64), 286 | ] 287 | 288 | aucs = [] 289 | hists = [] 290 | for hist_size, batch_size in pairs: 291 | print(f"History size: {hist_size}, Batch size: {batch_size}") 292 | 293 | df_ = df.pipe( 294 | truncate_history, 295 | column=DEFAULT_HISTORY_ARTICLE_ID_COL, 296 | history_size=hist_size, 297 | padding_value=0, 298 | enable_warning=False, 299 | ) 300 | 301 | test_dataloader = NRMSDataLoader( 302 | behaviors=df_, 303 | article_dict=article_mapping, 304 | unknown_representation="zeros", 305 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 306 | eval_mode=True, 307 | batch_size=batch_size, 308 | ) 309 | 310 | scores = model.scorer.predict(test_dataloader) 311 | 312 | df_pred = add_prediction_scores(df_, scores.tolist()) 313 | 314 | metrics = MetricEvaluator( 315 | labels=df_pred["labels"], 316 | predictions=df_pred["scores"], 317 | metric_functions=[AucScore()], 318 | ) 319 | metrics.evaluate() 320 | auc = metrics.evaluations["auc"] 321 | aucs.append(round(auc, 4)) 322 | hists.append(hist_size) 323 | print(f"{auc} (History size: {hist_size}, Batch size: {batch_size})") 324 | 325 | for h, a in zip(hists, aucs): 326 | print(f"({a}, {h}),") 327 | 328 | results = {h: a for h, a in zip(hists, aucs)} 329 | write_json_file(results, ARTIFACT_DIR.joinpath("auc_history_length.json")) 330 | 331 | # Clean up 332 | if TEST_CHUNKS_DIR.exists() and TEST_CHUNKS_DIR.is_dir(): 333 | shutil.rmtree(TEST_CHUNKS_DIR) 334 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ebrec" 3 | description = "Ekstra Bladet Benchmark" 4 | version = "0.0.1" 5 | authors = [{ name = "Johannes Kruse", email = "johannes.kruse@eb.dk" }] 6 | requires-python = ">=3.10, <3.12" 7 | dependencies = [ 8 | # fastformer: 9 | "transformers>=4.30.0, <4.37.3", 10 | # newsrec: 11 | "tensorflow>=2.12.0, <2.16.0", 12 | # Fastformer; DeepCTR 13 | "torch>=2.0.0, <2.3.0", 14 | # Evaluation: 15 | "scikit-learn==1.4.0", 16 | # GENERAL: 17 | "numpy>=1.24.0, <1.26.1", 18 | "polars==0.20.8", 19 | "pyyaml==6.0.1", 20 | "tqdm", 21 | ] 22 | 23 | 
[project.optional-dependencies] 24 | # pip install "my_project[extras]" 25 | # pip install -e .'[notebooks]' 26 | notebooks = ["transformers", "jupyter"] 27 | tests = [ 28 | "pytest", 29 | "transformers>=4.30.0, <4.37.3", 30 | "tensorflow>=2.12.0, <2.16.0", 31 | "torch>=2.0.0, <2.3.0", 32 | ] 33 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/src/__init__.py -------------------------------------------------------------------------------- /src/ebrec/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .metrics_protocols import ( 2 | RootMeanSquaredError, 3 | MetricEvaluator, 4 | AccuracyScore, 5 | LogLossScore, 6 | NdcgScore, 7 | AucScore, 8 | F1Score, 9 | MrrScore, 10 | ) 11 | from .beyond_accuracy import ( 12 | IntralistDiversity, 13 | Distribution, 14 | Serendipity, 15 | Coverage, 16 | Novelty, 17 | ) 18 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/_ba_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics.pairwise import cosine_distances 3 | 4 | from ebrec.evaluation.beyond_accuracy import ( 5 | IntralistDiversity, 6 | Distribution, 7 | Serendipity, 8 | Novelty, 9 | Coverage, 10 | ) 11 | 12 | lookup_dict = { 13 | "101": {"doc_vec": np.array([1, 0, 0]), "v": 1, "sv": [1], "pop_sc": 0.50}, 14 | "102": {"doc_vec": np.array([0, 1, 0]), "v": 2, "sv": [1], "pop_sc": 0.25}, 15 | "103": {"doc_vec": np.array([1, 1, 1]), "v": 3, "sv": [1], "pop_sc": 0.75}, 16 | "104": {"doc_vec": np.array([1, 1, 1]), "v": 4, "sv": [1], "pop_sc": 0.50}, 17 | "105": {"doc_vec": np.array([-1, 0, 0]), "v": 5, "sv": [1], "pop_sc": 0.94}, 18 | "106": {"doc_vec": np.array([-1, 0, 0]), "v": 6, "sv": [1, 2], "pop_sc": 0.95}, 19 | "107": {"doc_vec": np.array([-1, 0, 0]), "v": 7, "sv": [1, 2], "pop_sc": 0.96}, 20 | "108": {"doc_vec": np.array([0, 0, 1]), "v": 8, "sv": [1, 2], "pop_sc": 0.50}, 21 | "400": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4], "pop_sc": 0.20}, 22 | "401": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4, 5], "pop_sc": 0.20}, 23 | } 24 | 25 | # 404 is not excepted, however, setup supports it: 26 | R = np.array( 27 | [ 28 | ["101", "102", "400"], 29 | ["101", "103", "400"], 30 | ["101", "102", "103"], 31 | ["101", "104", "400"], 32 | ["101", "106", "404"], 33 | ["404", "404", "404"], 34 | ] 35 | ) 36 | 37 | C = ["1", "2", "101", "102", "103", "104", "105", "106", "107", "108", "400", "401"] 38 | 39 | click_histories = [ 40 | np.array([["101", "102"]]), 41 | np.array([["105", "106", "400"]]), 42 | np.array([["102", "103", "104"]]), 43 | np.array([["101", "400"]]), 44 | np.array([["400"]]), 45 | np.array([["400"]]), 46 | ] 47 | pairwise_distance_function = cosine_distances 48 | 49 | # => IntralistDiversity 50 | lookup_key = "doc_vec" 51 | div = IntralistDiversity() 52 | div(R, lookup_dict=lookup_dict, lookup_key=lookup_key) 53 | div._candidate_diversity( 54 | R=C, 55 | n_recommendations=2, 56 | lookup_dict=lookup_dict, 57 | lookup_key=lookup_key, 58 | pairwise_distance_function=pairwise_distance_function, 59 | ) 60 | 61 | try: 62 | div._candidate_diversity(C, 7, lookup_dict=lookup_dict, lookup_key=lookup_key) 63 | except ValueError as e: 64 | print(f"Failed - hurra! 
Error message: \n {e}") 65 | 66 | # => Distribution 67 | dist = Distribution() 68 | dist(R[:2], lookup_dict, "v") 69 | dist(R, lookup_dict, "sv") 70 | dist(C, lookup_dict, "v") 71 | try: 72 | dist(C, lookup_dict, "q") 73 | except ValueError as e: 74 | print(f"Failed - hurra! Error message: \n {e}") 75 | 76 | # => Coverage 77 | cov = Coverage() 78 | cov(R) 79 | cov(R, C) 80 | 81 | # => Serendipity 82 | ser = Serendipity() 83 | ser( 84 | R=R, 85 | H=click_histories, 86 | lookup_dict=lookup_dict, 87 | lookup_key=lookup_key, 88 | pairwise_distance_function=pairwise_distance_function, 89 | ) 90 | # np.nan_to_num(ser(R, click_histories, lookup_dict, lookup_key), 0.0) 91 | 92 | # => Novelty 93 | nov = Novelty() 94 | nov(R, lookup_dict=lookup_dict, lookup_key="pop_sc") 95 | nov._candidate_novelty(C, 2, lookup_dict=lookup_dict, lookup_key="pop_sc") 96 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from ._beyond_accuracy import * 2 | from ._classification import * 3 | from ._ranking import * 4 | from ._sklearn import * 5 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/_beyond_accuracy.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from sklearn.metrics.pairwise import cosine_distances 4 | from collections import Counter 5 | import numpy as np 6 | 7 | 8 | def intralist_diversity( 9 | R: np.ndarray[np.ndarray], 10 | pairwise_distance_function: Callable = cosine_distances, 11 | ) -> float: 12 | """Calculate the intra-list diversity of a recommendation list. 13 | 14 | This function implements the method described by Smyth and McClave (2001) to 15 | measure the diversity within a recommendation list. It calculates the average 16 | pairwise distance between all items in the list. 17 | 18 | Formula: 19 | Diversity(R) = ( sum_{i∈R} sum_{j∈R_{i}} dist(i, j) ) / ( |R|(|R|-1) ) 20 | 21 | where `R` is the recommendation list, and `dist` represents the pairwise distance function used. 22 | 23 | Args: 24 | R (np.ndarray[np.ndarray]): A 2D numpy array where each row represents a recommendation. 25 | This array should be either array-like or a sparse matrix, with shape (n_samples_X, n_features). 26 | pairwise_distance_function (Callable, optional): A function to compute pairwise distance 27 | between samples. Defaults to `cosine_distances`. 28 | 29 | Returns: 30 | float: The calculated diversity score. If the recommendation list contains less than or 31 | equal to one item, NaN is returned to signify an undefined diversity score. 32 | 33 | References: 34 | Smyth, B., McClave, P. (2001). Similarity vs. Diversity. In: Aha, D.W., Watson, I. (eds) 35 | Case-Based Reasoning Research and Development. ICCBR 2001. Lecture Notes in Computer Science(), 36 | vol 2080. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/3-540-44593-5_25
37 | 
38 |     Examples:
39 |         >>> R1 = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
40 |         >>> print(intralist_diversity(R1))
41 |             0.022588438516842262
42 |         >>> print(intralist_diversity(np.array([[0.1, 0.2], [0.1, 0.2]])))
43 |             1.1102230246251565e-16
44 |     """
45 |     R_n = R.shape[0]  # number of recommendations
46 |     if R_n <= 1:
47 |         # One or fewer recommendations in the recommendation list
48 |         diversity = np.nan
49 |     else:
50 |         pairwise_distances = pairwise_distance_function(R, R)
51 |         diversity = np.sum(pairwise_distances) / (R_n * (R_n - 1))
52 |     return diversity
53 | 
54 | 
55 | def serendipity(
56 |     R: np.ndarray[np.ndarray],
57 |     H: np.ndarray[np.ndarray],
58 |     pairwise_distance_function: Callable = cosine_distances,
59 | ) -> float:
60 |     """Calculate the serendipity score between a set of recommendations and user's reading history.
61 | 
62 |     This function implements the concept of serendipity as defined by Feng Lu, Anca Dumitrache, and David Graus (2020).
63 |     Serendipity in this context is measured as the mean distance between the items in the recommendation list and the
64 |     user's reading history.
65 | 
66 |     Formula:
67 |         Serendipity(R, H) = ( sum_{i∈R} sum_{j∈H} dist(i, j) ) / ( |R||H| )
68 | 
69 |     where `R` is the recommendation list, `H` is the user's reading history, and `dist` is the pairwise distance function.
70 | 
71 |     Args:
72 |         R (np.ndarray[np.ndarray]): A 2D numpy array representing the recommendation list, where each row is a recommendation.
73 |             It should be either array-like or a sparse matrix, with shape (n_samples_X, n_features).
74 |         H (np.ndarray[np.ndarray]): A 2D numpy array representing the user's reading history, with the same format as R.
75 |         pairwise_distance_function (Callable, optional): A function to compute pairwise distance between samples.
76 |             Defaults to `cosine_distances`.
77 | 
78 |     Returns:
79 |         float: The calculated serendipity score.
80 | 
81 |     References:
82 |         Lu, F., Dumitrache, A., & Graus, D. (2020). Beyond Optimizing for Clicks: Incorporating Editorial Values in News Recommendation.
83 |         Retrieved from https://arxiv.org/abs/2004.09980
84 | 
85 |     Examples:
86 |         >>> R1 = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]])
87 |         >>> H1 = np.array([[0.7, 0.8, 0.9], [0.1, 0.2, 0.3]])
88 |         >>> print(serendipity(R1, H1))
89 |             0.016941328887631724
90 |     """
91 |     # Compute the pairwise distances between each vector:
92 |     dists = pairwise_distance_function(R, H)
93 |     # Compute serendipity:
94 |     return np.mean(dists)
95 | 
96 | 
97 | def coverage_count(R: np.ndarray) -> int:
98 |     """Calculate the number of distinct items in a recommendation list.
99 | 
100 |     Args:
101 |         R (np.ndarray): An array containing the items in the recommendation list.
102 | 
103 |     Returns:
104 |         int: The count of distinct items in the recommendation list.
105 | 
106 |     Examples:
107 |         >>> R1 = np.array([1, 2, 3, 4, 5, 5, 6])
108 |         >>> print(coverage_count(R1))
109 |             6
110 |     """
111 |     # Distinct items:
112 |     return np.unique(R).size
113 | 
114 | 
115 | def coverage_fraction(R: np.ndarray, C: np.ndarray) -> float:
116 |     """Calculate the fraction of distinct items in the recommendation list compared to a universal set.
117 | 
118 |     Args:
119 |         R (np.ndarray): An array containing the items in the recommendation list.
120 |         C (np.ndarray): An array representing the universal set of items.
121 |             It should contain all possible items that can be recommended.
122 | 123 | Returns: 124 | float: The fraction representing the coverage of the recommendation system. 125 | This is calculated as the size of unique elements in R divided by the size of unique elements in C. 126 | 127 | Examples: 128 | >>> R1 = np.array([1, 2, 3, 4, 5, 5, 6]) 129 | >>> C1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) 130 | >>> print(coverage_fraction(R1, C1)) # Expected output: 0.6 131 | 0.6 132 | """ 133 | # Distinct items: 134 | return np.unique(R).size / np.unique(C).size 135 | 136 | 137 | def novelty(R: np.ndarray[float]) -> float: 138 | """Calculate the novelty score of recommendations based on their popularity. 139 | 140 | This function computes the novelty score for a set of recommendations by applying the self-information popularity metric. 141 | It uses the formula described by Zhou et al. (2010) and Vargas and Castells (2011). The novelty is calculated as the 142 | average negative logarithm (base 2) of the popularity scores of the items in the recommendation list. 143 | 144 | Formula: 145 | Novelty(R) = ( sum_{i∈R} -log2( p_i ) / ( |R| ) 146 | 147 | where p_i represents the popularity score of each item in the recommendation list R, and |R| is the size of R. 148 | 149 | Args: 150 | R (np.ndarray[float]): An array of popularity scores (p_i) for each item in the recommendation list. 151 | 152 | Returns: 153 | float: The calculated novelty score. Higher values indicate less popular (more novel) recommendations. 154 | 155 | References: 156 | Zhou et al. (2010). 157 | Vargas & Castells (2011). 158 | 159 | Examples: 160 | >>> print(novelty([0.1, 0.2, 0.3, 0.4, 0.5])) # Expected: High score (low popularity scores) 161 | 1.9405499757656586 162 | >>> print(novelty([0.9, 0.9, 0.9, 1.0, 0.5])) # Expected: Low score (high popularity scores) 163 | 0.29120185606703 164 | """ 165 | return np.mean(-np.log2(R)) 166 | 167 | 168 | def index_of_dispersion(x: list[int]) -> float: 169 | """ 170 | Computes the Index of Dispersion (variance-to-mean ratio) for a given dataset of nominal variables. 171 | 172 | The Index of Dispersion is a statistical measure used to quantify the dispersion or variability of a distribution 173 | relative to its mean. It's particularly useful in identifying whether a dataset follows a Poisson distribution, 174 | where the Index of Dispersion would be approximately 1. 175 | 176 | Formula: 177 | D = ( k * (N^2 - Σf^2) ) / ( N^2 * (k-1) ) 178 | Where: 179 | k = number of categories in the data set (including categories with zero items), 180 | N = number of items in the set, 181 | f = number of frequencies or ratings, 182 | Σf^2 = sum of squared frequencies/ratings. 183 | 184 | Args: 185 | x (list[int]): A list of integers representing frequencies or counts of occurrences in different categories. 186 | Each integer in the list corresponds to the count of occurrences in a given category. 187 | 188 | Returns: 189 | float: The Index of Dispersion for the dataset. Returns `np.nan` if the input list contains only one item, 190 | indicating an undefined Index of Dispersion. Returns 0 if there's only one category present in the dataset. 191 | 192 | References: 193 | Walker, 1999, Statistics in criminal 194 | Source: https://www.statisticshowto.com/index-of-dispersion/ 195 | 196 | Examples: 197 | Given the following categories: Math(25), Economics(42), Chemistry(13), Physical Education (8), Religious Studies (13). 
198 | >>> N = np.sum(25+42+13+8+13) 199 | >>> k = 5 200 | >>> sq_f2 = np.sum(25**2 + 42**2 + 13**2 + 8**2 + 13**2) 201 | >>> iod = ( k * (N**2 - sq_f2)) / ( N**2 * (k-1) ) 202 | 0.9079992157631604 203 | 204 | Validate method: 205 | >>> cat = [[1]*25, [2]*42, [3]*13, [4]*8, [5]*13] 206 | >>> flat_list = [item for sublist in cat for item in sublist] 207 | >>> index_of_dispersion(flat_list) 208 | 0.9079992157631604 209 | """ 210 | # number of items 211 | N = len(x) 212 | # compute frequencies 213 | count = Counter(x) 214 | # number of categories 215 | k = len(count) 216 | if k == 1: 217 | if N == 1: 218 | return np.nan 219 | else: 220 | return 0 221 | # squared frequencies 222 | f_squared = [count.get(f) ** 2 for f in count] 223 | # compute Index of Dispersion 224 | D = k * (N**2 - sum(f_squared)) / (N**2 * (k - 1)) 225 | return D 226 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def auc_score_custom(y_true: np.ndarray, y_pred: np.ndarray) -> float: 5 | """ 6 | Computes the Area Under the Curve (AUC) score for the Receiver Operating Characteristic (ROC) curve using a 7 | custom method. This implementation is particularly useful for understanding basic ROC curve properties and 8 | for educational purposes to demonstrate how AUC scores can be manually calculated. 9 | 10 | This function may produce slightly different results compared to standard library implementations (e.g., sklearn's roc_auc_score) 11 | in cases where positive and negative predictions have the same score. The function treats the problem as a binary classification task, 12 | comparing the prediction scores for positive instances against those for negative instances directly. 13 | 14 | Args: 15 | y_true (np.ndarray): A binary array indicating the true classification (1 for positive class and 0 for negative class). 16 | y_pred (np.ndarray): An array of scores as predicted by a model, indicating the likelihood of each instance being positive. 17 | 18 | Returns: 19 | float: The calculated AUC score, representing the probability that a randomly chosen positive instance is ranked 20 | higher than a randomly chosen negative instance based on the prediction scores. 21 | 22 | Raises: 23 | ValueError: If `y_true` and `y_pred` do not have the same length or if they contain invalid data types. 
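    Formula (as implemented below, with P the positive and N the negative instances and s the predicted scores):
        AUC = ( sum_{p∈P} sum_{n∈N} 1[ s_p > s_n ] ) / ( |P| * |N| )

    Tied scores (s_p == s_n) contribute 0, which is why the result can differ from sklearn's
    `roc_auc_score` when a positive and a negative instance share the same score.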
24 | 25 | Examples: 26 | >>> y_true = np.array([1, 1, 0, 0, 1, 0, 0, 0]) 27 | >>> y_pred = np.array([0.9999, 0.9838, 0.5747, 0.8485, 0.8624, 0.4502, 0.3357, 0.8985]) 28 | >>> auc_score_custom(y_true, y_pred) 29 | 0.9333333333333333 30 | >>> from sklearn.metrics import roc_auc_score 31 | >>> roc_auc_score(y_true, y_pred) 32 | 0.9333333333333333 33 | 34 | An error will occur when pos/neg prediction have same score: 35 | >>> y_true = np.array([1, 1, 0, 0, 1, 0, 0, 0]) 36 | >>> y_pred = np.array([0.9999, 0.8, 0.8, 0.8485, 0.8624, 0.4502, 0.3357, 0.8985]) 37 | >>> auc_score_custom(y_true, y_pred) 38 | 0.7333 39 | >>> roc_auc_score(y_true, y_pred) 40 | 0.7667 41 | """ 42 | y_true = np.asarray(y_true) 43 | y_pred = np.asarray(y_pred) 44 | 45 | y_true_bool = y_true.astype(np.bool_) 46 | # Index: 47 | pos_scores = y_pred[y_true_bool] 48 | neg_scores = y_pred[np.logical_not(y_true_bool)] 49 | # Arrange: 50 | pos_scores = np.repeat(pos_scores, len(neg_scores)) 51 | neg_scores = np.tile(neg_scores, sum(y_true_bool)) 52 | assert len(neg_scores) == len(pos_scores) 53 | return (pos_scores > neg_scores).sum() / len(neg_scores) 54 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/_ranking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def reciprocal_rank_score(y_true: np.ndarray, y_pred: np.ndarray) -> float: 5 | """Computes the Mean Reciprocal Rank (MRR) score. 6 | 7 | Args: 8 | y_true (np.ndarray): A 1D array of ground-truth labels. These should be binary (0 or 1), 9 | where 1 indicates the relevant item. 10 | y_pred (np.ndarray): A 1D array of predicted scores. These scores indicate the likelihood 11 | of items being relevant. 12 | 13 | Returns: 14 | float: The mean reciprocal rank (MRR) score. 15 | 16 | Note: 17 | Both `y_true` and `y_pred` should be 1D arrays of the same length. 18 | The function assumes higher scores in `y_pred` indicate higher relevance. 19 | 20 | Examples: 21 | >>> y_true_1 = np.array([0, 0, 1]) 22 | >>> y_pred_1 = np.array([0.5, 0.2, 0.1]) 23 | >>> reciprocal_rank_score(y_true_1, y_pred_1) 24 | 0.33 25 | 26 | >>> y_true_2 = np.array([0, 1, 1]) 27 | >>> y_pred_2 = np.array([0.5, 0.2, 0.1]) 28 | >>> reciprocal_rank_score(y_true_2, y_pred_2) 29 | 0.5 30 | 31 | >>> y_true_3 = np.array([1, 1, 0]) 32 | >>> y_pred_3 = np.array([0.5, 0.2, 0.1]) 33 | >>> reciprocal_rank_score(y_true_3, y_pred_3) 34 | 1.0 35 | 36 | >>> np.mean( 37 | [ 38 | reciprocal_rank_score(y_true, y_pred) 39 | for y_true, y_pred in zip( 40 | [y_true_1, y_true_2, y_true_3], [y_pred_1, y_pred_2, y_pred_3] 41 | ) 42 | ] 43 | ) 44 | 0.61 45 | mrr_score([y_true_1, y_true_2, y_true_3], [y_pred_1, y_pred_2, y_pred_3]) 46 | """ 47 | order = np.argsort(y_pred)[::-1] 48 | y_true = np.take(y_true, order) 49 | first_positive_rank = np.argmax(y_true) + 1 50 | return 1.0 / first_positive_rank 51 | 52 | 53 | def dcg_score(y_true: np.ndarray, y_pred: np.ndarray, k: int = 10) -> float: 54 | """ 55 | Compute the Discounted Cumulative Gain (DCG) score at a particular rank `k`. 56 | 57 | Args: 58 | y_true (np.ndarray): A 1D or 2D array of ground-truth relevance labels. 59 | Each element should be a non-negative integer. 60 | y_pred (np.ndarray): A 1D or 2D array of predicted scores. Each element is 61 | a score corresponding to the predicted relevance. 62 | k (int, optional): The rank at which the DCG score is calculated. Defaults 63 | to 10. 
If `k` is larger than the number of elements, it 64 | will be truncated to the number of elements. 65 | 66 | Note: 67 | In case of a 2D array, each row represents a different sample. 68 | 69 | Returns: 70 | float: The calculated DCG score for the top `k` elements. 71 | 72 | Raises: 73 | ValueError: If `y_true` and `y_pred` have different shapes. 74 | 75 | Examples: 76 | >>> from sklearn.metrics import dcg_score as dcg_score_sklearn 77 | >>> y_true = np.array([1, 0, 0, 1, 0]) 78 | >>> y_pred = np.array([0.5, 0.2, 0.1, 0.8, 0.4]) 79 | >>> dcg_score(y_true, y_pred) 80 | 1.6309297535714575 81 | >>> dcg_score_sklearn([y_true], [y_pred]) 82 | 1.6309297535714573 83 | """ 84 | k = min(np.shape(y_true)[-1], k) 85 | order = np.argsort(y_pred)[::-1] 86 | y_true = np.take(y_true, order[:k]) 87 | gains = 2**y_true - 1 88 | discounts = np.log2(np.arange(len(y_true)) + 2) 89 | return np.sum(gains / discounts) 90 | 91 | 92 | def ndcg_score(y_true: np.ndarray, y_pred: np.ndarray, k: int = 10) -> float: 93 | """ 94 | Compute the Normalized Discounted Cumulative Gain (NDCG) score at a rank `k`. 95 | 96 | Args: 97 | y_true (np.ndarray): A 1D or 2D array of ground-truth relevance labels. 98 | Each element should be a non-negative integer. In case 99 | of a 2D array, each row represents a different sample. 100 | y_pred (np.ndarray): A 1D or 2D array of predicted scores. Each element is 101 | a score corresponding to the predicted relevance. The 102 | array should have the same shape as `y_true`. 103 | k (int, optional): The rank at which the NDCG score is calculated. Defaults 104 | to 10. If `k` is larger than the number of elements, it 105 | will be truncated to the number of elements. 106 | 107 | Returns: 108 | float: The calculated NDCG score for the top `k` elements. The score ranges 109 | from 0 to 1, with 1 representing the perfect ranking. 110 | 111 | Examples: 112 | >>> from sklearn.metrics import ndcg_score as ndcg_score_sklearn 113 | >>> y_true = np.array([1, 0, 0, 1, 0]) 114 | >>> y_pred = np.array([0.1, 0.2, 0.1, 0.8, 0.4]) 115 | >>> ndcg_score([y_true], [y_pred]) 116 | 0.863780110436402 117 | >>> ndcg_score_sklearn([y_true], [y_pred]) 118 | 0.863780110436402 119 | >>> 120 | """ 121 | best = dcg_score(y_true, y_true, k) 122 | actual = dcg_score(y_true, y_pred, k) 123 | return actual / best 124 | 125 | 126 | def mrr_score(y_true: np.ndarray, y_pred: np.ndarray) -> float: 127 | """Computes the Mean Reciprocal Rank (MRR) score. 128 | 129 | THIS MIGHT NOT ALL PROPER, TO BE DETERMIEND: 130 | - https://github.com/recommenders-team/recommenders/issues/2141 131 | 132 | Args: 133 | y_true (np.ndarray): A 1D array of ground-truth labels. These should be binary (0 or 1), 134 | where 1 indicates the relevant item. 135 | y_pred (np.ndarray): A 1D array of predicted scores. These scores indicate the likelihood 136 | of items being relevant. 137 | 138 | Returns: 139 | float: The mean reciprocal rank (MRR) score. 140 | 141 | Note: 142 | Both `y_true` and `y_pred` should be 1D arrays of the same length. 143 | The function assumes higher scores in `y_pred` indicate higher relevance. 
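    Formula (as implemented below, with rank(i) the 1-based position of item i after sorting by `y_pred`):
        score = ( sum_{i: y_true_i = 1} 1 / rank(i) ) / ( sum_i y_true_i )

    i.e. the reciprocal ranks of all relevant items are averaged, not only that of the first
    relevant item as in the usual MRR definition (hence the note above).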
144 | 145 | Examples: 146 | >>> y_true = np.array([[1, 0, 0, 1, 0]]) 147 | >>> y_pred = np.array([[0.5, 0.2, 0.1, 0.8, 0.4]]) 148 | >>> mrr_score(y_true, y_pred) 149 | 0.75 150 | 151 | """ 152 | order = np.argsort(y_pred)[::-1] 153 | y_true = np.take(y_true, order) 154 | rr_score = y_true / (np.arange(len(y_true)) + 1) 155 | return np.sum(rr_score) / np.sum(y_true) 156 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/_sklearn.py: -------------------------------------------------------------------------------- 1 | try: 2 | from sklearn.metrics import ( 3 | # _regression: 4 | mean_squared_error, 5 | # _ranking: 6 | roc_auc_score, 7 | # _classification: 8 | accuracy_score, 9 | f1_score, 10 | log_loss, 11 | ) 12 | except ImportError: 13 | print("sklearn not available") 14 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics_protocols.py: -------------------------------------------------------------------------------- 1 | from itertools import compress 2 | from typing import Iterable 3 | from tqdm import tqdm 4 | import numpy as np 5 | import json 6 | 7 | from ebrec.evaluation.utils import convert_to_binary 8 | from ebrec.evaluation.protocols import Metric 9 | 10 | from ebrec.evaluation.metrics import ( 11 | mean_squared_error, 12 | accuracy_score, 13 | roc_auc_score, 14 | ndcg_score, 15 | mrr_score, 16 | log_loss, 17 | f1_score, 18 | ) 19 | 20 | 21 | class AccuracyScore(Metric): 22 | def __init__(self, threshold: float = 0.5): 23 | self.threshold = threshold 24 | self.name = "accuracy" 25 | 26 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 27 | res = np.mean( 28 | [ 29 | accuracy_score( 30 | each_labels, convert_to_binary(each_preds, self.threshold) 31 | ) 32 | for each_labels, each_preds in tqdm( 33 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 34 | ) 35 | ] 36 | ) 37 | return float(res) 38 | 39 | 40 | class F1Score(Metric): 41 | def __init__(self, threshold: float = 0.5): 42 | self.threshold = threshold 43 | self.name = "f1" 44 | 45 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 46 | res = np.mean( 47 | [ 48 | f1_score(each_labels, convert_to_binary(each_preds, self.threshold)) 49 | for each_labels, each_preds in tqdm( 50 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 51 | ) 52 | ] 53 | ) 54 | return float(res) 55 | 56 | 57 | class RootMeanSquaredError(Metric): 58 | def __init__(self): 59 | self.name = "rmse" 60 | 61 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 62 | res = np.mean( 63 | [ 64 | np.sqrt(mean_squared_error(each_labels, each_preds)) 65 | for each_labels, each_preds in tqdm( 66 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 67 | ) 68 | ] 69 | ) 70 | return float(res) 71 | 72 | 73 | class AucScore(Metric): 74 | def __init__(self): 75 | self.name = "auc" 76 | 77 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 78 | res = np.mean( 79 | [ 80 | roc_auc_score(each_labels, each_preds) 81 | for each_labels, each_preds in tqdm( 82 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 83 | ) 84 | ] 85 | ) 86 | return float(res) 87 | 88 | 89 | class LogLossScore(Metric): 90 | def __init__(self): 91 | self.name = "logloss" 92 | 93 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 94 | res = np.mean( 95 | [ 96 | log_loss( 97 | 
each_labels, 98 | [max(min(p, 1.0 - 10e-12), 10e-12) for p in each_preds], 99 | ) 100 | for each_labels, each_preds in tqdm( 101 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 102 | ) 103 | ] 104 | ) 105 | return float(res) 106 | 107 | 108 | class MrrScore(Metric): 109 | def __init__(self) -> Metric: 110 | self.name = "mrr" 111 | 112 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 113 | mean_mrr = np.mean( 114 | [ 115 | mrr_score(each_labels, each_preds) 116 | for each_labels, each_preds in tqdm( 117 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 118 | ) 119 | ] 120 | ) 121 | return float(mean_mrr) 122 | 123 | 124 | class NdcgScore(Metric): 125 | def __init__(self, k: int): 126 | self.k = k 127 | self.name = f"ndcg@{k}" 128 | 129 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 130 | res = np.mean( 131 | [ 132 | ndcg_score(each_labels, each_preds, self.k) 133 | for each_labels, each_preds in tqdm( 134 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 135 | ) 136 | ] 137 | ) 138 | return float(res) 139 | 140 | 141 | class MetricEvaluator: 142 | """ 143 | >>> y_true = [[1, 0, 0], [1, 1, 0], [1, 0, 0, 0]] 144 | >>> y_pred = [[0.2, 0.3, 0.5], [0.18, 0.7, 0.1], [0.18, 0.2, 0.1, 0.1]] 145 | 146 | >>> met_eval = MetricEvaluator( 147 | labels=y_true, 148 | predictions=y_pred, 149 | metric_functions=[ 150 | AucScore(), 151 | MrrScore(), 152 | NdcgScore(k=5), 153 | NdcgScore(k=10), 154 | LogLossScore(), 155 | RootMeanSquaredError(), 156 | AccuracyScore(threshold=0.5), 157 | F1Score(threshold=0.5), 158 | ], 159 | ) 160 | >>> met_eval.evaluate() 161 | { 162 | "auc": 0.5555555555555556, 163 | "mrr": 0.5277777777777778, 164 | "ndcg@5": 0.7103099178571526, 165 | "ndcg@10": 0.7103099178571526, 166 | "logloss": 0.716399020295845, 167 | "rmse": 0.5022870658128165 168 | "accuracy": 0.5833333333333334, 169 | "f1": 0.2222222222222222 170 | } 171 | """ 172 | 173 | def __init__( 174 | self, 175 | labels: list[np.ndarray], 176 | predictions: list[np.ndarray], 177 | metric_functions: list[Metric], 178 | ): 179 | self.labels = labels 180 | self.predictions = predictions 181 | self.metric_functions = metric_functions 182 | self.evaluations = dict() 183 | 184 | def evaluate(self) -> dict: 185 | self.evaluations = { 186 | metric_function.name: metric_function(self.labels, self.predictions) 187 | for metric_function in self.metric_functions 188 | } 189 | return self 190 | 191 | @property 192 | def metric_functions(self): 193 | return self.__metric_functions 194 | 195 | @metric_functions.setter 196 | def metric_functions(self, values): 197 | invalid_callables = self.__invalid_callables(values) 198 | if not any(invalid_callables) and invalid_callables: 199 | self.__metric_functions = values 200 | else: 201 | invalid_objects = list(compress(values, invalid_callables)) 202 | invalid_types = [type(item) for item in invalid_objects] 203 | raise TypeError(f"Following object(s) are not callable: {invalid_types}") 204 | 205 | @staticmethod 206 | def __invalid_callables(iter: Iterable): 207 | return [not callable(item) for item in iter] 208 | 209 | def __str__(self): 210 | if self.evaluations: 211 | evaluations_json = json.dumps(self.evaluations, indent=4) 212 | return f": \n {evaluations_json}" 213 | else: 214 | return f": {self.evaluations}" 215 | 216 | def __repr__(self): 217 | return str(self) 218 | -------------------------------------------------------------------------------- 
/src/ebrec/evaluation/protocols.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | import numpy as np 3 | 4 | 5 | class Metric(Protocol): 6 | name: str 7 | 8 | def calculate(self, y_true: np.ndarray, y_score: np.ndarray) -> float: ... 9 | 10 | def __str__(self) -> str: 11 | return f": params: {self.__dict__}" 12 | 13 | def __repr__(self) -> str: 14 | return str(self) 15 | 16 | def __call__(self, y_true: np.ndarray, y_score: np.ndarray) -> float: 17 | return self.calculate(y_true, y_score) 18 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/utils.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from typing import Iterable 3 | import numpy as np 4 | 5 | 6 | def convert_to_binary(y_pred: np.ndarray, threshold: float): 7 | y_pred = np.asarray(y_pred) 8 | y_pred[y_pred >= threshold] = 1 9 | y_pred[y_pred < threshold] = 0 10 | return y_pred 11 | 12 | 13 | def is_iterable_nested_dtype(iterable: Iterable[any], dtypes) -> bool: 14 | """ 15 | Check whether iterable is a nested with dtype, 16 | note, we assume all types in iterable are the the same. 17 | Check all cases: any(isinstance(i, dtypes) for i in a) 18 | 19 | Args: 20 | iterable (Iterable[Any]): iterable (list, array, tuple) of any type of data 21 | dtypes (Tuple): tuple of possible dtypes, e.g. dtypes = (list, np.ndarray) 22 | Returns: 23 | bool: boolean whether it is true or false 24 | 25 | Examples: 26 | >>> is_iterable_nested_dtype([1, 2, 3], list) 27 | False 28 | >>> is_iterable_nested_dtype([1, 2, 3], (list, int)) 29 | True 30 | >>> is_iterable_nested_dtype([[1], [2], [3]], list) 31 | True 32 | """ 33 | return isinstance(iterable[0], dtypes) 34 | 35 | 36 | def compute_combinations(n: int, r: int) -> int: 37 | """Compute Combinations where order does not matter (without replacement) 38 | 39 | Source: https://www.statskingdom.com/combinations-calculator.html 40 | Args: 41 | n (int): number of items 42 | r (int): number of items being chosen at a time 43 | Returns: 44 | int: number of possible combinations 45 | 46 | Formula: 47 | * nCr = n! / ( (n - r)! * r! 
) 48 | 49 | Assume the following: 50 | * we sample without replacement of items 51 | * order of the outcomes does NOT matter 52 | """ 53 | return int( 54 | (np.math.factorial(n)) / (np.math.factorial(n - r) * np.math.factorial(r)) 55 | ) 56 | 57 | 58 | def scale_range( 59 | m: np.ndarray, 60 | r_min: float = None, 61 | r_max: float = None, 62 | t_min: float = 0, 63 | t_max: float = 1.0, 64 | ) -> None: 65 | """Scale an array between a range 66 | Source: https://stats.stackexchange.com/questions/281162/scale-a-number-between-a-range 67 | 68 | m -> ((m-r_min)/(r_max-r_min)) * (t_max-t_min) + t_min 69 | 70 | Args: 71 | m ∈ [r_min,r_max] denote your measurements to be scaled 72 | r_min denote the minimum of the range of your measurement 73 | r_max denote the maximum of the range of your measurement 74 | t_min denote the minimum of the range of your desired target scaling 75 | t_max denote the maximum of the range of your desired target scaling 76 | """ 77 | if not r_min: 78 | r_min = np.min(m) 79 | if not r_max: 80 | r_max = np.max(m) 81 | return ((m - r_min) / (r_max - r_min)) * (t_max - t_min) + t_min 82 | 83 | 84 | # utils for 85 | def compute_item_popularity_scores(R: Iterable[np.ndarray]) -> dict[str, float]: 86 | """Compute popularity scores for items based on their occurrence in user interactions. 87 | 88 | This function calculates the popularity score of each item as the fraction of users who have interacted with that item. 89 | The popularity score, p_i, for an item is defined as the number of users who have interacted with the item divided by the 90 | total number of users. 91 | 92 | Formula: 93 | p_i = | {u ∈ U}, r_ui != Ø | / |U| 94 | 95 | where p_i is the popularity score of an item, U is the total number of users, and r_ui is the interaction of user u with item i (non-zero 96 | interaction implies the user has seen the item). 97 | 98 | Note: 99 | Each entry can only have the same item ones. TODO - ADD THE TEXT DONE HERE. 100 | 101 | Args: 102 | R (Iterable[np.ndarray]): An iterable of numpy arrays, where each array represents the items interacted with by a single user. 103 | Each element in the array should be a string identifier for an item. 104 | 105 | Returns: 106 | dict[str, float]: A dictionary where keys are item identifiers and values are their corresponding popularity scores (as floats). 107 | 108 | Examples: 109 | >>> R = [ 110 | np.array(["item1", "item2", "item3"]), 111 | np.array(["item1", "item3"]), 112 | np.array(["item1", "item4"]), 113 | ] 114 | >>> print(popularity_scores(R)) 115 | {'item1': 1.0, 'item2': 0.3333333333333333, 'item3': 0.6666666666666666, 'item4': 0.3333333333333333} 116 | """ 117 | U = len(R) 118 | R_flatten = np.concatenate(R) 119 | item_counts = Counter(R_flatten) 120 | return {item: (r_ui / U) for item, r_ui in item_counts.items()} 121 | 122 | 123 | def compute_normalized_distribution( 124 | R: np.ndarray[str], 125 | weights: np.ndarray[float] = None, 126 | distribution: dict[str, float] = None, 127 | ) -> dict[str, float]: 128 | """ 129 | Compute a normalized weigted distribution for a list of items that each can have a single representation assigned. 130 | 131 | Args: 132 | a (np.ndarray[str]): an array of items representation. 133 | weights (np.ndarray[float], optional): weights to assign each element in a. Defaults to None. 134 | * Following yields: len(weights) == len(a) 135 | distribution (Dict[str, float], optional): dictionary to assign the distribution values, if None it will be generated as {}. Defaults to None. 
136 | * Use case; if you want to add distribution values to existing, one can input it. 137 | 138 | Returns: 139 | Dict[str, float]: dictionary with normalized distribution values 140 | 141 | Examples: 142 | >>> a = np.array(["a", "b", "c", "c"]) 143 | >>> compute_normalized_distribution(a) 144 | {'a': 0.25, 'b': 0.25, 'c': 0.5} 145 | """ 146 | n_elements = len(R) 147 | 148 | distr = distribution if distribution is not None else {} 149 | weights = weights if weights is not None else np.ones(n_elements) / n_elements 150 | for item, weight in zip(R, weights): 151 | distr[item] = weight + distr.get(item, 0.0) 152 | return distr 153 | 154 | 155 | def get_keys_in_dict(id_list: any, dictionary: dict) -> list[any]: 156 | """ 157 | Returns a list of IDs from id_list that are keys in the dictionary. 158 | Args: 159 | id_list (List[Any]): List of IDs to check against the dictionary. 160 | dictionary (Dict[Any, Any]): Dictionary where keys are checked against the IDs. 161 | 162 | Returns: 163 | List[Any]: List of IDs that are also keys in the dictionary. 164 | 165 | Examples: 166 | >>> get_keys_in_dict(['a', 'b', 'c'], {'a': 1, 'c': 3, 'd': 4}) 167 | ['a', 'c'] 168 | """ 169 | return [id_ for id_ in id_list if id_ in dictionary] 170 | 171 | 172 | def check_key_in_all_nested_dicts(dictionary: dict, key: str) -> None: 173 | """ 174 | Checks if the given key is present in all nested dictionaries within the main dictionary. 175 | Raises a ValueError if the key is not found in any of the nested dictionaries. 176 | 177 | Args: 178 | dictionary (dict): The dictionary containing nested dictionaries to check. 179 | key (str): The key to look for in all nested dictionaries. 180 | 181 | Raises: 182 | ValueError: If the key is not present in any of the nested dictionaries. 183 | 184 | Example: 185 | >>> nested_dict = { 186 | "101": {"name": "Alice", "age": 30}, 187 | "102": {"name": "Bob", "age": 25}, 188 | } 189 | >>> check_key_in_all_nested_dicts(nested_dict, "age") 190 | # No error is raised 191 | >>> check_key_in_all_nested_dicts(nested_dict, "salary") 192 | # Raises ValueError: 'salary is not present in all nested dictionaries.' 193 | """ 194 | for dict_key, sub_dict in dictionary.items(): 195 | if not isinstance(sub_dict, dict) or key not in sub_dict: 196 | raise ValueError( 197 | f"'{key}' is not present in '{dict_key}' nested dictionary." 
198 | ) 199 | -------------------------------------------------------------------------------- /src/ebrec/models/fastformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/src/ebrec/models/fastformer/__init__.py -------------------------------------------------------------------------------- /src/ebrec/models/fastformer/dataloader.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from tqdm import tqdm 3 | import polars as pl 4 | import numpy as np 5 | 6 | from torch.utils.tensorboard import SummaryWriter 7 | from torch.utils.data import DataLoader 8 | from torch.utils.data import Dataset 9 | import torch.optim as optim 10 | import torch.nn as nn 11 | import torch 12 | 13 | from ebrec.utils._constants import DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_LABELS_COL 14 | 15 | from ebrec.utils._python import ( 16 | repeat_by_list_values_from_matrix, 17 | convert_to_nested_list, 18 | create_lookup_objects, 19 | ) 20 | from ebrec.utils._articles_behaviors import map_list_article_id_to_value 21 | from ebrec.utils._polars import shuffle_rows 22 | 23 | from ebrec.evaluation import AucScore 24 | from ebrec.utils._torch import save_checkpoint 25 | 26 | 27 | @dataclass 28 | class FastformerDataset(Dataset): 29 | """_summary_ 30 | The batch-size is aggragating multiple impressions and processing them simultaneous, which 31 | has a major effect on the training time. Hence, you should put the batch_size=1 in the 'DataLoader' 32 | and just use FastformerDataset batch_size. 33 | 34 | Note, the outut is then (1, output_shape), where the 1 is the DataLoader batch_size. 35 | """ 36 | 37 | behaviors: pl.DataFrame 38 | history_column: str 39 | article_dict: dict[int, pl.Series] 40 | batch_size: int = 64 41 | shuffle: bool = True 42 | device: str = "cpu" 43 | seed: int = None 44 | labels_col: str = DEFAULT_LABELS_COL 45 | inview_col: str = DEFAULT_INVIEW_ARTICLES_COL 46 | n_samples_col: str = "n_samples" 47 | 48 | def __post_init__(self): 49 | self.unknown_index = [0] 50 | if self.shuffle: 51 | self.behaviors = shuffle_rows(self.behaviors, seed=self.seed) 52 | self.behaviors = self.behaviors.with_columns( 53 | pl.col(self.labels_col).list.len().alias(self.n_samples_col) 54 | ) 55 | self.lookup_indexes, self.lookup_matrix = create_lookup_objects( 56 | self.article_dict, unknown_representation="zeros" 57 | ) 58 | 59 | def __len__(self): 60 | """ 61 | Number of batch steps in the data 62 | """ 63 | return int(np.ceil(self.behaviors.shape[0] / self.batch_size)) 64 | 65 | def __getitem__(self, index: int): 66 | """ 67 | Get the batch of samples for the given index. 68 | 69 | Note: The dataset class provides a single index for each iteration. The batching is done internally in this method 70 | to utilize and optimize for speed. This can be seen as a mini-batching approach. 71 | 72 | Args: 73 | index (int): An integer index. 74 | 75 | Returns: 76 | Tuple[torch.Tensor, torch.Tensor]: A tuple containing the input features and labels as torch Tensors. 77 | Note, the output of the PyTorch DataLoader is (1, *shape), where 1 is the DataLoader's batch_size. 
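        Example (illustrative sketch; `behaviors_df` and `article_dict` are assumed to be prepared
        with the same columns and article-embedding mapping used elsewhere in this repository):
            >>> from torch.utils.data import DataLoader
            >>> from ebrec.utils._constants import DEFAULT_HISTORY_ARTICLE_ID_COL
            >>> dataset = FastformerDataset(
            ...     behaviors=behaviors_df,
            ...     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
            ...     article_dict=article_dict,
            ...     batch_size=32,
            ... )
            >>> dataloader = DataLoader(dataset, batch_size=1)  # batching happens inside the dataset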
78 | """ 79 | # Clever way to batch the data: 80 | batch_indices = range(index * self.batch_size, (index + 1) * self.batch_size) 81 | batch = self.behaviors[batch_indices] 82 | if self.shuffle: 83 | batch = shuffle_rows(batch, seed=self.seed) 84 | # => 85 | x = ( 86 | batch.drop(self.labels_col) 87 | .pipe( 88 | map_list_article_id_to_value, 89 | behaviors_column=self.history_column, 90 | mapping=self.lookup_indexes, 91 | fill_nulls=self.unknown_index, 92 | ) 93 | .pipe( 94 | map_list_article_id_to_value, 95 | behaviors_column=self.inview_col, 96 | mapping=self.lookup_indexes, 97 | fill_nulls=self.unknown_index, 98 | ) 99 | ) 100 | # => 101 | repeats = np.array(batch[self.n_samples_col]) 102 | # => 103 | history_input = repeat_by_list_values_from_matrix( 104 | input_array=x[self.history_column].to_list(), 105 | matrix=self.lookup_matrix, 106 | repeats=repeats, 107 | ).squeeze(2) 108 | # => 109 | candidate_input = self.lookup_matrix[x[self.inview_col].explode().to_list()] 110 | # => 111 | history_input = torch.Tensor(history_input).type(torch.int).to(self.device) 112 | candidate_input = torch.Tensor(candidate_input).type(torch.int).to(self.device) 113 | y = ( 114 | torch.Tensor(batch[self.labels_col].explode()) 115 | .view(-1, 1) 116 | .type(torch.float) 117 | .to(self.device) 118 | ) 119 | # ======================== 120 | return (history_input, candidate_input), y 121 | 122 | 123 | def batch_input_label_concatenation( 124 | inputs: tuple[torch.Tensor], labels: torch.Tensor 125 | ) -> tuple[torch.Tensor, torch.Tensor]: 126 | """ """ 127 | return (inputs[0].squeeze(0), inputs[1].squeeze(0)), labels.squeeze(0) 128 | 129 | 130 | def compute_auc_from_fixed_pos_neg_samples( 131 | y_true: list[float], y_pred: list[float] 132 | ) -> float: 133 | # 134 | n_samples = int(np.sum(y_true)) 135 | y_true = convert_to_nested_list(y_true, n_samples) 136 | y_pred = convert_to_nested_list(y_pred, n_samples) 137 | val_auc = AucScore().calculate(y_true=y_true, y_pred=y_pred) 138 | return val_auc 139 | 140 | 141 | def train( 142 | model: nn.Module, 143 | train_dataloader: DataLoader, 144 | criterion: nn.Module, 145 | optimizer: optim.Optimizer, 146 | num_epochs: int = 5, 147 | val_dataloader: DataLoader = None, 148 | state_dict_path: str = "model_state_dict.pt", 149 | patience: int = None, 150 | summary_writer: SummaryWriter = None, 151 | gradient_accumulation_steps: int = 1, 152 | tqdm_disable: bool = False, 153 | tqdm_ncol: int = 80, 154 | monitor_metric: str = "loss", 155 | ) -> nn.Module: 156 | """ """ 157 | min_val_loss = np.inf 158 | max_val_auc = -np.inf 159 | early_stop = 0 160 | global_steps = 0 161 | total_batches = len(train_dataloader) 162 | running_loss = 0.0 163 | running_samples = 0 164 | # ==> TRAIN LOOP: 165 | for epoch in range(num_epochs): 166 | # => Set the model to train mode 167 | model.train(True) 168 | progress_bar = tqdm( 169 | train_dataloader, 170 | desc=f"Epoch [{epoch + 1}/{num_epochs}]", 171 | disable=tqdm_disable, 172 | ncols=tqdm_ncol, 173 | ) 174 | # => Zero the parameter gradients 175 | optimizer.zero_grad() 176 | for batch_idx, (inputs, labels) in enumerate(progress_bar, start=1): 177 | # => Move inputs and labels to device 178 | inputs, labels = batch_input_label_concatenation(inputs, labels) 179 | # => Forward pass 180 | outputs = model(*inputs) 181 | loss = criterion(outputs, labels) 182 | # => Backward pass and optimization 183 | loss.backward() 184 | # => Update training loss 185 | global_steps += 1 186 | running_loss += loss.item() * len(outputs) 187 | running_samples += 
len(outputs) 188 | current_loss = running_loss / running_samples 189 | progress_bar.set_postfix({"Loss": round(current_loss, 6)}) 190 | # => 191 | if summary_writer is not None: 192 | summary_writer.add_scalar( 193 | tag="Train/Loss", 194 | scalar_value=current_loss, 195 | global_step=global_steps, 196 | ) 197 | # => Accumulated gradient step: 198 | if ( 199 | batch_idx % gradient_accumulation_steps == 0 200 | or batch_idx == total_batches 201 | ): 202 | # => Take step and zero gradients 203 | optimizer.step() 204 | optimizer.zero_grad() 205 | 206 | # ==> EVAL LOOP: 207 | if val_dataloader: 208 | model.train(False) 209 | all_outputs, all_labels, val_loss = evaluate( 210 | model=model, 211 | dataloader=val_dataloader, 212 | criterion=criterion, 213 | tqdm_disable=tqdm_disable, 214 | ) 215 | 216 | if summary_writer is not None: 217 | summary_writer.add_scalar( 218 | tag="Val/Loss", scalar_value=val_loss, global_step=global_steps 219 | ) 220 | 221 | if monitor_metric == "auc": 222 | val_auc = compute_auc_from_fixed_pos_neg_samples( 223 | y_true=np.ravel(all_labels.tolist()), 224 | y_pred=np.ravel(all_outputs.tolist()), 225 | ) 226 | print(f"Val/AUC : {round(val_auc, 6)}") 227 | if summary_writer is not None: 228 | summary_writer.add_scalar( 229 | tag="Val/AUC", scalar_value=val_auc, global_step=global_steps 230 | ) 231 | 232 | # => MODEL CHECKPOINT 233 | if monitor_metric == "loss" and val_loss < min_val_loss: 234 | save_checkpoint(model, path=state_dict_path) 235 | min_val_loss = val_loss 236 | early_stop = 0 237 | elif monitor_metric == "auc" and val_auc > max_val_auc: 238 | save_checkpoint(model, path=state_dict_path) 239 | max_val_auc = val_auc 240 | early_stop = 0 241 | else: 242 | early_stop += 1 243 | # => EARLYSTOP 244 | if patience is not None and early_stop == patience: 245 | break 246 | 247 | if summary_writer is not None: 248 | summary_writer.close() 249 | 250 | if val_dataloader: 251 | model.load_state_dict(torch.load(state_dict_path), strict=True) 252 | 253 | return model 254 | 255 | 256 | def evaluate( 257 | model: nn.Module, 258 | dataloader: DataLoader, 259 | criterion: nn.Module, 260 | tqdm_disable: bool = False, 261 | tqdm_ncol: int = 80, 262 | device: str = "cpu", 263 | ) -> tuple[list[float], list[float], float]: 264 | model.eval() 265 | all_outputs = [] 266 | all_labels = [] 267 | loss = 0.0 268 | n_samples = 0 269 | with torch.no_grad(): 270 | progress_bar = tqdm( 271 | dataloader, 272 | desc="Evaluating", 273 | total=dataloader.__len__(), 274 | disable=tqdm_disable, 275 | ncols=tqdm_ncol, 276 | ) 277 | for inputs, labels in progress_bar: 278 | inputs, labels = batch_input_label_concatenation(inputs, labels) 279 | # Forward pass 280 | outputs = model(*inputs) 281 | batch_loss = criterion(outputs, labels) 282 | # => 283 | loss += batch_loss.item() * len(outputs) 284 | n_samples += len(outputs) 285 | # => 286 | all_outputs.append(outputs) 287 | all_labels.append(labels) 288 | # 289 | progress_bar.set_postfix({"Eval Loss": round(loss / n_samples, 4)}) 290 | # => 291 | all_outputs = torch.cat(all_outputs, dim=0) 292 | all_labels = torch.cat(all_labels, dim=0) 293 | loss = loss / n_samples 294 | return all_outputs, all_labels, loss 295 | -------------------------------------------------------------------------------- /src/ebrec/models/fastformer/fastformer_wu.py: -------------------------------------------------------------------------------- 1 | from transformers.models.bert.modeling_bert import ( 2 | BertSelfOutput, 3 | BertIntermediate, 4 | BertOutput, 5 | ) 6 | import 
logging 7 | import torch.nn as nn 8 | import torch 9 | 10 | 11 | class AttentionPooling(nn.Module): 12 | def __init__(self, config): 13 | self.config = config 14 | super(AttentionPooling, self).__init__() 15 | self.att_fc1 = nn.Linear(config.hidden_size, config.hidden_size) 16 | self.att_fc2 = nn.Linear(config.hidden_size, 1) 17 | self.apply(self.init_weights) 18 | 19 | def init_weights(self, module): 20 | if isinstance(module, nn.Linear): 21 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 22 | if isinstance(module, nn.Linear) and module.bias is not None: 23 | module.bias.data.zero_() 24 | 25 | def forward(self, x, attn_mask=None): 26 | bz = x.shape[0] 27 | e = self.att_fc1(x) 28 | e = nn.Tanh()(e) 29 | alpha = self.att_fc2(e) 30 | alpha = torch.exp(alpha) 31 | if attn_mask is not None: 32 | alpha = alpha * attn_mask.unsqueeze(2) 33 | alpha = alpha / (torch.sum(alpha, dim=1, keepdim=True) + 1e-8) 34 | x = torch.bmm(x.permute(0, 2, 1), alpha) 35 | x = torch.reshape(x, (bz, -1)) 36 | return x 37 | 38 | 39 | class FastSelfAttention(nn.Module): 40 | def __init__(self, config): 41 | super(FastSelfAttention, self).__init__() 42 | self.config = config 43 | if config.hidden_size % config.num_attention_heads != 0: 44 | raise ValueError( 45 | "The hidden size (%d) is not a multiple of the number of attention " 46 | "heads (%d)" % (config.hidden_size, config.num_attention_heads) 47 | ) 48 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 49 | self.num_attention_heads = config.num_attention_heads 50 | self.all_head_size = self.num_attention_heads * self.attention_head_size 51 | self.input_dim = config.hidden_size 52 | 53 | self.query = nn.Linear(self.input_dim, self.all_head_size) 54 | self.query_att = nn.Linear(self.all_head_size, self.num_attention_heads) 55 | self.key = nn.Linear(self.input_dim, self.all_head_size) 56 | self.key_att = nn.Linear(self.all_head_size, self.num_attention_heads) 57 | self.transform = nn.Linear(self.all_head_size, self.all_head_size) 58 | 59 | self.softmax = nn.Softmax(dim=-1) 60 | 61 | self.apply(self.init_weights) 62 | 63 | def init_weights(self, module): 64 | if isinstance(module, nn.Linear): 65 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 66 | if isinstance(module, nn.Linear) and module.bias is not None: 67 | module.bias.data.zero_() 68 | 69 | def transpose_for_scores(self, x): 70 | new_x_shape = x.size()[:-1] + ( 71 | self.num_attention_heads, 72 | self.attention_head_size, 73 | ) 74 | x = x.view(*new_x_shape) 75 | return x.permute(0, 2, 1, 3) 76 | 77 | def forward(self, hidden_states, attention_mask): 78 | # batch_size, seq_len, num_head * head_dim, batch_size, seq_len 79 | batch_size, seq_len, _ = hidden_states.shape 80 | mixed_query_layer = self.query(hidden_states) 81 | mixed_key_layer = self.key(hidden_states) 82 | # batch_size, num_head, seq_len 83 | query_for_score = ( 84 | self.query_att(mixed_query_layer).transpose(1, 2) 85 | / self.attention_head_size**0.5 86 | ) 87 | # add attention mask 88 | query_for_score += attention_mask 89 | 90 | # batch_size, num_head, 1, seq_len 91 | query_weight = self.softmax(query_for_score).unsqueeze(2) 92 | 93 | # batch_size, num_head, seq_len, head_dim 94 | query_layer = self.transpose_for_scores(mixed_query_layer) 95 | 96 | # batch_size, num_head, head_dim, 1 97 | pooled_query = ( 98 | torch.matmul(query_weight, query_layer) 99 | .transpose(1, 2) 100 | .view(-1, 1, self.num_attention_heads * self.attention_head_size) 101 | ) 102 | 
pooled_query_repeat = pooled_query.repeat(1, seq_len, 1) 103 | # batch_size, num_head, seq_len, head_dim 104 | 105 | # batch_size, num_head, seq_len 106 | mixed_query_key_layer = mixed_key_layer * pooled_query_repeat 107 | 108 | query_key_score = ( 109 | self.key_att(mixed_query_key_layer) / self.attention_head_size**0.5 110 | ).transpose(1, 2) 111 | 112 | # add attention mask 113 | query_key_score += attention_mask 114 | 115 | # batch_size, num_head, 1, seq_len 116 | query_key_weight = self.softmax(query_key_score).unsqueeze(2) 117 | 118 | key_layer = self.transpose_for_scores(mixed_query_key_layer) 119 | pooled_key = torch.matmul(query_key_weight, key_layer) 120 | 121 | # query = value 122 | weighted_value = (pooled_key * query_layer).transpose(1, 2) 123 | weighted_value = weighted_value.reshape( 124 | weighted_value.size()[:-2] 125 | + (self.num_attention_heads * self.attention_head_size,) 126 | ) 127 | weighted_value = self.transform(weighted_value) + mixed_query_layer 128 | 129 | return weighted_value 130 | 131 | 132 | class FastAttention(nn.Module): 133 | def __init__(self, config): 134 | super(FastAttention, self).__init__() 135 | self.self = FastSelfAttention(config) 136 | self.output = BertSelfOutput(config) 137 | 138 | def forward(self, input_tensor, attention_mask): 139 | self_output = self.self(input_tensor, attention_mask) 140 | attention_output = self.output(self_output, input_tensor) 141 | return attention_output 142 | 143 | 144 | class FastformerLayer(nn.Module): 145 | def __init__(self, config): 146 | super(FastformerLayer, self).__init__() 147 | self.attention = FastAttention(config) 148 | self.intermediate = BertIntermediate(config) 149 | self.output = BertOutput(config) 150 | 151 | def forward(self, hidden_states, attention_mask): 152 | attention_output = self.attention(hidden_states, attention_mask) 153 | intermediate_output = self.intermediate(attention_output) 154 | layer_output = self.output(intermediate_output, attention_output) 155 | return layer_output 156 | 157 | 158 | class StandardFastformerEncoder(nn.Module): 159 | def __init__(self, config, pooler_count=1): 160 | super(StandardFastformerEncoder, self).__init__() 161 | self.config = config 162 | self.encoders = nn.ModuleList( 163 | [FastformerLayer(config) for _ in range(config.num_hidden_layers)] 164 | ) 165 | self.position_embeddings = nn.Embedding( 166 | config.max_position_embeddings, config.hidden_size 167 | ) 168 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 169 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 170 | 171 | # support multiple different poolers with shared bert encoder. 
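        # (e.g. one pooler can aggregate token embeddings into a news vector while another
        # aggregates news vectors into a user vector, both reusing the same encoder layers)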
172 | self.poolers = nn.ModuleList() 173 | if config.pooler_type == "weightpooler": 174 | for _ in range(pooler_count): 175 | self.poolers.append(AttentionPooling(config)) 176 | logging.info(f"This model has {len(self.poolers)} poolers.") 177 | self.apply(self.init_weights) 178 | 179 | def init_weights(self, module): 180 | if isinstance(module, (nn.Linear, nn.Embedding)): 181 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 182 | if isinstance(module, (nn.Embedding)) and module.padding_idx is not None: 183 | with torch.no_grad(): 184 | module.weight[module.padding_idx].fill_(0) 185 | elif isinstance(module, nn.LayerNorm): 186 | module.bias.data.zero_() 187 | module.weight.data.fill_(1.0) 188 | if isinstance(module, nn.Linear) and module.bias is not None: 189 | module.bias.data.zero_() 190 | 191 | def forward(self, input_embs, attention_mask, pooler_index=0) -> torch.Tensor: 192 | """ 193 | Forward pass through the encoder. 194 | 195 | Parameters: 196 | input_embs (torch.Tensor): The input embeddings, with shape (batch_size, n_tokens, emb_dim). 197 | attention_mask (torch.Tensor): The attention mask, with shape (batch_size, n_tokens), where 198 | values of 1 indicate positions to attend to and 0s indicate positions to mask. 199 | pooler_index (int, optional): Index of the pooler to use to aggregate the encoder's output. Default is 0. 200 | 201 | Returns: 202 | torch.Tensor: The output of the encoder, processed and pooled according to the specified pooler. 203 | with shape (batch_size, config.hidden_size). 204 | 205 | Usage: 206 | >>> encoder_output = model.forward(input_embs, attention_mask, pooler_index=0) 207 | """ 208 | extended_attention_mask = attention_mask.unsqueeze(1) 209 | extended_attention_mask = extended_attention_mask.to( 210 | dtype=next(self.parameters()).dtype 211 | ) # fp16 compatibility 212 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 213 | 214 | batch_size, n_tokens, emb_dim = input_embs.shape 215 | 216 | position_ids = torch.arange( 217 | n_tokens, dtype=torch.long, device=input_embs.device 218 | ) 219 | position_ids = position_ids.unsqueeze(0).expand(batch_size, -1) 220 | position_embeddings = self.position_embeddings(position_ids) 221 | 222 | embeddings = input_embs + position_embeddings 223 | embeddings = self.LayerNorm(embeddings) 224 | embeddings = self.dropout(embeddings) 225 | 226 | all_hidden_states = [embeddings] 227 | 228 | for layer_module in self.encoders: 229 | layer_outputs = layer_module(all_hidden_states[-1], extended_attention_mask) 230 | all_hidden_states.append(layer_outputs) 231 | 232 | output = self.poolers[pooler_index](all_hidden_states[-1], attention_mask) 233 | 234 | return output 235 | 236 | 237 | class Fastformer_wu(torch.nn.Module): 238 | def __init__( 239 | self, 240 | config, 241 | word_embedding: nn.Embedding, 242 | ): 243 | super(Fastformer_wu, self).__init__() 244 | self.config = config 245 | self.word_embedding = word_embedding 246 | self.embedding_transform = nn.Linear( 247 | word_embedding.weight.shape[1], config.hidden_size 248 | ) 249 | # 4 classes; likely the npratio 250 | self.output_layer = nn.Linear(config.hidden_size, 4) 251 | self.fastformer_model = StandardFastformerEncoder(config) 252 | self.criterion = nn.CrossEntropyLoss() 253 | self.apply(self.init_weights) 254 | 255 | def init_weights(self, module): 256 | if isinstance(module, (nn.Linear, nn.Embedding)): 257 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 258 | if isinstance(module, 
(nn.Embedding)) and module.padding_idx is not None: 259 | with torch.no_grad(): 260 | module.weight[module.padding_idx].fill_(0) 261 | if isinstance(module, nn.Linear) and module.bias is not None: 262 | module.bias.data.zero_() 263 | 264 | def forward(self, input_ids, targets): 265 | mask = input_ids.bool().float() 266 | embds = self.word_embedding(input_ids) 267 | embds = self.embedding_transform(embds) 268 | text_vec = self.fastformer_model(embds, mask) 269 | score = self.output_layer(text_vec) 270 | loss = self.criterion(score, targets) 271 | return loss, score 272 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/__init__.py: -------------------------------------------------------------------------------- 1 | from .npa import NPAModel 2 | from .lstur import LSTURModel 3 | from .nrms import NRMSModel 4 | from .naml import NAMLModel 5 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/base_model.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | from tensorflow import keras 3 | import tensorflow as tf 4 | import numpy as np 5 | import abc 6 | 7 | __all__ = ["BaseModel"] 8 | 9 | 10 | class BaseModel: 11 | """Basic class of models 12 | 13 | Attributes: 14 | hparams (object): A tf.contrib.training.HParams object, hold the entire set of hyperparameters. 15 | graph (object): An optional graph. 16 | seed (int): Random seed. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | hparams: Dict[str, Any], 22 | word2vec_embedding: np.ndarray = None, 23 | # if 'word2vec_embedding' not provided: 24 | word_emb_dim: int = 300, 25 | vocab_size: int = 32000, 26 | seed=None, 27 | ): 28 | """Initializing the model. Create common logics which are needed by all deeprec models, such as loss function, 29 | parameter set. 30 | 31 | Args: 32 | hparams (object): Hold the entire set of hyperparameters. 33 | seed (int): Random seed. 34 | """ 35 | self.seed = seed 36 | tf.random.set_seed(seed) 37 | np.random.seed(seed) 38 | 39 | # ASSIGN 'hparams': 40 | self.hparams = hparams 41 | 42 | # INIT THE WORD-EMBEDDINGS: 43 | if word2vec_embedding is None: 44 | self.word2vec_embedding = np.random.rand(vocab_size, word_emb_dim) 45 | else: 46 | self.word2vec_embedding = word2vec_embedding 47 | 48 | # BUILD AND COMPILE MODEL: 49 | self.model, self.scorer = self._build_graph() 50 | self.loss = self._get_loss(self.hparams.loss) 51 | self.train_optimizer = self._get_opt( 52 | optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate 53 | ) 54 | self.model.compile(loss=self.loss, optimizer=self.train_optimizer) 55 | 56 | @abc.abstractmethod 57 | def _build_graph(self): 58 | """Subclass will implement this.""" 59 | pass 60 | 61 | def _get_loss(self, loss: str): 62 | """Make loss function, consists of data loss and regularization loss 63 | 64 | Returns: 65 | object: Loss function or loss function name 66 | """ 67 | if loss == "cross_entropy_loss": 68 | data_loss = "categorical_crossentropy" 69 | elif loss == "log_loss": 70 | data_loss = "binary_crossentropy" 71 | else: 72 | raise ValueError(f"this loss not defined {loss}") 73 | return data_loss 74 | 75 | def _get_opt(self, optimizer: str, lr: float): 76 | """Get the optimizer according to configuration. Usually we will use Adam. 77 | Returns: 78 | object: An optimizer. 
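        Example (editor's sketch, not part of the original file; the learning rate is illustrative):
            >>> opt = self._get_opt(optimizer="adam", lr=1e-4)   # returns keras.optimizers.Adam(learning_rate=1e-4)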
79 | """ 80 | 81 | if optimizer == "adam": 82 | train_opt = keras.optimizers.Adam(learning_rate=lr) 83 | else: 84 | raise ValueError(f"this optimizer not defined {optimizer}") 85 | 86 | return train_opt 87 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras import layers 4 | from tensorflow.keras import backend as K 5 | 6 | 7 | class AttLayer2(layers.Layer): 8 | """Soft alignment attention implement. 9 | 10 | Attributes: 11 | dim (int): attention hidden dim 12 | """ 13 | 14 | def __init__(self, dim=200, seed=0, **kwargs): 15 | """Initialization steps for AttLayer2. 16 | 17 | Args: 18 | dim (int): attention hidden dim 19 | """ 20 | 21 | self.dim = dim 22 | self.seed = seed 23 | super(AttLayer2, self).__init__(**kwargs) 24 | 25 | def build(self, input_shape): 26 | """Initialization for variables in AttLayer2 27 | There are there variables in AttLayer2, i.e. W, b and q. 28 | 29 | Args: 30 | input_shape (object): shape of input tensor. 31 | """ 32 | 33 | assert len(input_shape) == 3 34 | dim = self.dim 35 | self.W = self.add_weight( 36 | name="W", 37 | shape=(int(input_shape[-1]), dim), 38 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 39 | trainable=True, 40 | ) 41 | self.b = self.add_weight( 42 | name="b", 43 | shape=(dim,), 44 | initializer=keras.initializers.Zeros(), 45 | trainable=True, 46 | ) 47 | self.q = self.add_weight( 48 | name="q", 49 | shape=(dim, 1), 50 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 51 | trainable=True, 52 | ) 53 | super(AttLayer2, self).build(input_shape) # be sure you call this somewhere! 54 | 55 | def call(self, inputs, mask=None, **kwargs): 56 | """Core implemention of soft attention 57 | 58 | Args: 59 | inputs (object): input tensor. 60 | 61 | Returns: 62 | object: weighted sum of input tensors. 63 | """ 64 | 65 | attention = K.tanh(K.dot(inputs, self.W) + self.b) 66 | attention = K.dot(attention, self.q) 67 | 68 | attention = K.squeeze(attention, axis=2) 69 | 70 | if mask == None: 71 | attention = K.exp(attention) 72 | else: 73 | attention = K.exp(attention) * K.cast(mask, dtype="float32") 74 | 75 | attention_weight = attention / ( 76 | K.sum(attention, axis=-1, keepdims=True) + K.epsilon() 77 | ) 78 | 79 | attention_weight = K.expand_dims(attention_weight) 80 | weighted_input = inputs * attention_weight 81 | return K.sum(weighted_input, axis=1) 82 | 83 | def compute_mask(self, input, input_mask=None): 84 | """Compte output mask value 85 | 86 | Args: 87 | input (object): input tensor. 88 | input_mask: input mask 89 | 90 | Returns: 91 | object: output mask. 92 | """ 93 | return None 94 | 95 | def compute_output_shape(self, input_shape): 96 | """Compute shape of output tensor 97 | 98 | Args: 99 | input_shape (tuple): shape of input tensor. 100 | 101 | Returns: 102 | tuple: shape of output tensor. 103 | """ 104 | return input_shape[0], input_shape[-1] 105 | 106 | 107 | class SelfAttention(layers.Layer): 108 | """Multi-head self attention implement. 109 | 110 | Args: 111 | multiheads (int): The number of heads. 112 | head_dim (object): Dimention of each head. 113 | mask_right (boolean): whether to mask right words. 114 | 115 | Returns: 116 | object: Weighted sum after attention. 
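    Example (editor's sketch, not from the repository; batch size, sequence length and embedding dim are made up):
        >>> import tensorflow as tf
        >>> attn = SelfAttention(multiheads=4, head_dim=8, seed=0)
        >>> seq = tf.random.normal((2, 10, 32))    # (batch_size, seq_len, emb_dim)
        >>> out = attn([seq, seq, seq])            # Q = K = V for self-attention
        >>> out.shape                              # TensorShape([2, 10, 32]), since multiheads * head_dim = 32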
117 | """ 118 | 119 | def __init__(self, multiheads, head_dim, seed=0, mask_right=False, **kwargs): 120 | """Initialization steps for AttLayer2. 121 | 122 | Args: 123 | multiheads (int): The number of heads. 124 | head_dim (object): Dimention of each head. 125 | mask_right (boolean): whether to mask right words. 126 | """ 127 | 128 | self.multiheads = multiheads 129 | self.head_dim = head_dim 130 | self.output_dim = multiheads * head_dim 131 | self.mask_right = mask_right 132 | self.seed = seed 133 | super(SelfAttention, self).__init__(**kwargs) 134 | 135 | def compute_output_shape(self, input_shape): 136 | """Compute shape of output tensor. 137 | 138 | Returns: 139 | tuple: output shape tuple. 140 | """ 141 | 142 | return (input_shape[0][0], input_shape[0][1], self.output_dim) 143 | 144 | def build(self, input_shape): 145 | """Initialization for variables in SelfAttention. 146 | There are three variables in SelfAttention, i.e. WQ, WK ans WV. 147 | WQ is used for linear transformation of query. 148 | WK is used for linear transformation of key. 149 | WV is used for linear transformation of value. 150 | 151 | Args: 152 | input_shape (object): shape of input tensor. 153 | """ 154 | 155 | self.WQ = self.add_weight( 156 | name="WQ", 157 | shape=(int(input_shape[0][-1]), self.output_dim), 158 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 159 | trainable=True, 160 | ) 161 | self.WK = self.add_weight( 162 | name="WK", 163 | shape=(int(input_shape[1][-1]), self.output_dim), 164 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 165 | trainable=True, 166 | ) 167 | self.WV = self.add_weight( 168 | name="WV", 169 | shape=(int(input_shape[2][-1]), self.output_dim), 170 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 171 | trainable=True, 172 | ) 173 | super(SelfAttention, self).build(input_shape) 174 | 175 | def Mask(self, inputs, seq_len, mode="add"): 176 | """Mask operation used in multi-head self attention 177 | 178 | Args: 179 | seq_len (object): sequence length of inputs. 180 | mode (str): mode of mask. 181 | 182 | Returns: 183 | object: tensors after masking. 184 | """ 185 | 186 | if seq_len is None: 187 | return inputs 188 | else: 189 | mask = K.one_hot(indices=seq_len[:, 0], num_classes=K.shape(inputs)[1]) 190 | mask = 1 - K.cumsum(mask, axis=1) 191 | 192 | for _ in range(len(inputs.shape) - 2): 193 | mask = K.expand_dims(mask, 2) 194 | 195 | if mode == "mul": 196 | return inputs * mask 197 | elif mode == "add": 198 | return inputs - (1 - mask) * 1e12 199 | 200 | def call(self, QKVs): 201 | """Core logic of multi-head self attention. 202 | 203 | Args: 204 | QKVs (list): inputs of multi-head self attention i.e. qeury, key and value. 205 | 206 | Returns: 207 | object: ouput tensors. 
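        Input format sketch (editor's addition; variable names are illustrative):
            >>> out = layer([Q_seq, K_seq, V_seq])                  # no length masking
            >>> out = layer([Q_seq, K_seq, V_seq, Q_len, V_len])    # Q_len/V_len are (batch_size, 1) sequence lengths used for masking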
208 | """ 209 | if len(QKVs) == 3: 210 | Q_seq, K_seq, V_seq = QKVs 211 | Q_len, V_len = None, None 212 | elif len(QKVs) == 5: 213 | Q_seq, K_seq, V_seq, Q_len, V_len = QKVs 214 | Q_seq = K.dot(Q_seq, self.WQ) 215 | Q_seq = K.reshape( 216 | Q_seq, shape=(-1, K.shape(Q_seq)[1], self.multiheads, self.head_dim) 217 | ) 218 | Q_seq = K.permute_dimensions(Q_seq, pattern=(0, 2, 1, 3)) 219 | 220 | K_seq = K.dot(K_seq, self.WK) 221 | K_seq = K.reshape( 222 | K_seq, shape=(-1, K.shape(K_seq)[1], self.multiheads, self.head_dim) 223 | ) 224 | K_seq = K.permute_dimensions(K_seq, pattern=(0, 2, 1, 3)) 225 | 226 | V_seq = K.dot(V_seq, self.WV) 227 | V_seq = K.reshape( 228 | V_seq, shape=(-1, K.shape(V_seq)[1], self.multiheads, self.head_dim) 229 | ) 230 | V_seq = K.permute_dimensions(V_seq, pattern=(0, 2, 1, 3)) 231 | A = tf.matmul(Q_seq, K_seq, adjoint_a=False, adjoint_b=True) / K.sqrt( 232 | K.cast(self.head_dim, dtype="float32") 233 | ) 234 | 235 | A = K.permute_dimensions( 236 | A, pattern=(0, 3, 2, 1) 237 | ) # A.shape=[batch_size,K_sequence_length,Q_sequence_length,self.multiheads] 238 | 239 | A = self.Mask(A, V_len, "add") 240 | A = K.permute_dimensions(A, pattern=(0, 3, 2, 1)) 241 | 242 | if self.mask_right: 243 | ones = K.ones_like(A[:1, :1]) 244 | lower_triangular = K.tf.matrix_band_part(ones, num_lower=-1, num_upper=0) 245 | mask = (ones - lower_triangular) * 1e12 246 | A = A - mask 247 | A = K.softmax(A) 248 | 249 | O_seq = tf.matmul(A, V_seq, adjoint_a=True, adjoint_b=False) 250 | O_seq = K.permute_dimensions(O_seq, pattern=(0, 2, 1, 3)) 251 | 252 | O_seq = K.reshape(O_seq, shape=(-1, K.shape(O_seq)[1], self.output_dim)) 253 | O_seq = self.Mask(O_seq, Q_len, "mul") 254 | return O_seq 255 | 256 | def get_config(self): 257 | """add multiheads, multiheads and mask_right into layer config. 258 | 259 | Returns: 260 | dict: config of SelfAttention layer. 261 | """ 262 | config = super(SelfAttention, self).get_config() 263 | config.update( 264 | { 265 | "multiheads": self.multiheads, 266 | "head_dim": self.head_dim, 267 | "mask_right": self.mask_right, 268 | } 269 | ) 270 | return config 271 | 272 | 273 | class ComputeMasking(layers.Layer): 274 | """Compute if inputs contains zero value. 275 | 276 | Returns: 277 | bool tensor: True for values not equal to zero. 278 | """ 279 | 280 | def __init__(self, **kwargs): 281 | super(ComputeMasking, self).__init__(**kwargs) 282 | 283 | def call(self, inputs, **kwargs): 284 | mask = K.not_equal(inputs, 0) 285 | return K.cast(mask, K.floatx()) 286 | 287 | def compute_output_shape(self, input_shape): 288 | return input_shape 289 | 290 | 291 | class OverwriteMasking(layers.Layer): 292 | """Set values at spasific positions to zero. 293 | 294 | Args: 295 | inputs (list): value tensor and mask tensor. 296 | 297 | Returns: 298 | object: tensor after setting values to zero. 299 | """ 300 | 301 | def __init__(self, **kwargs): 302 | super(OverwriteMasking, self).__init__(**kwargs) 303 | 304 | def build(self, input_shape): 305 | super(OverwriteMasking, self).build(input_shape) 306 | 307 | def call(self, inputs, **kwargs): 308 | return inputs[0] * K.expand_dims(inputs[1]) 309 | 310 | def compute_output_shape(self, input_shape): 311 | return input_shape[0] 312 | 313 | 314 | def PersonalizedAttentivePooling(dim1, dim2, dim3, seed=0): 315 | """Soft alignment attention implement. 316 | Attributes: 317 | dim1 (int): first dimention of value shape. 318 | dim2 (int): second dimention of value shape. 
319 | dim3 (int): shape of query 320 | 321 | Returns: 322 | object: weighted sum of the input values. 323 | """ 324 | vecs_input = keras.Input(shape=(dim1, dim2), dtype="float32") 325 | query_input = keras.Input(shape=(dim3,), dtype="float32") 326 | 327 | user_vecs = layers.Dropout(0.2)(vecs_input) 328 | user_att = layers.Dense( 329 | dim3, 330 | activation="tanh", 331 | kernel_initializer=keras.initializers.glorot_uniform(seed=seed), 332 | bias_initializer=keras.initializers.Zeros(), 333 | )(user_vecs) 334 | user_att2 = layers.Dot(axes=-1)([query_input, user_att]) 335 | user_att2 = layers.Activation("softmax")(user_att2) 336 | user_vec = layers.Dot((1, 1))([user_vecs, user_att2]) 337 | 338 | model = keras.Model([vecs_input, query_input], user_vec) 339 | return model 340 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/lstur.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | from ebrec.models.newsrec.layers import AttLayer2, ComputeMasking, OverwriteMasking 4 | from ebrec.models.newsrec.base_model import BaseModel 5 | from tensorflow.keras import layers 6 | import tensorflow.keras as keras 7 | 8 | 9 | __all__ = ["LSTURModel"] 10 | 11 | 12 | class LSTURModel(BaseModel): 13 | """LSTUR model (Neural News Recommendation with Long- and Short-term User Representations) 14 | 15 | Mingxiao An, Fangzhao Wu, Chuhan Wu, Kun Zhang, Zheng Liu and Xing Xie: 16 | Neural News Recommendation with Long- and Short-term User Representations, ACL 2019 17 | 18 | Attributes: 19 | word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix. 20 | hparam (object): Global hyper-parameters. 21 | """ 22 | 23 | def __init__( 24 | self, 25 | hparams, 26 | word2vec_embedding=None, 27 | seed=None, 28 | **kwargs, 29 | ): 30 | """Initialization steps for LSTUR. 31 | Compared with the BaseModel, LSTUR needs word embeddings. 32 | After creating the word embedding matrix, BaseModel's __init__ method will be called. 33 | 34 | Args: 35 | hparams (object): Global hyper-parameters. Some key settings such as type and gru_unit are there. 36 | """ 37 | 38 | super().__init__( 39 | hparams=hparams, 40 | word2vec_embedding=word2vec_embedding, 41 | seed=seed, 42 | **kwargs, 43 | ) 44 | 45 | def _build_graph(self): 46 | """Build LSTUR model and scorer. 47 | 48 | Returns: 49 | object: a model used to train. 50 | object: a model used to evaluate and inference. 51 | """ 52 | 53 | model, scorer = self._build_lstur() 54 | return model, scorer 55 | 56 | def _build_userencoder(self, titleencoder, type="ini"): 57 | """The main function to create the user encoder of LSTUR. 58 | 59 | Args: 60 | titleencoder (object): the news encoder of LSTUR. 61 | 62 | Return: 63 | object: the user encoder of LSTUR. 64 | """ 65 | 66 | his_input_title = keras.Input( 67 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 68 | ) 69 | user_indexes = keras.Input(shape=(1,), dtype="int32") 70 | 71 | user_embedding_layer = layers.Embedding( 72 | input_dim=self.hparams.n_users + 1, 73 | output_dim=self.hparams.gru_unit, # Dimension of the dense embedding.
74 | trainable=True, 75 | embeddings_initializer="zeros", 76 | ) 77 | 78 | long_u_emb = layers.Reshape((self.hparams.gru_unit,))( 79 | user_embedding_layer(user_indexes) 80 | ) 81 | click_title_presents = layers.TimeDistributed(titleencoder)(his_input_title) 82 | 83 | if type == "ini": 84 | user_present = layers.GRU( 85 | self.hparams.gru_unit, 86 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 87 | recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed), 88 | bias_initializer=keras.initializers.Zeros(), 89 | )( 90 | layers.Masking(mask_value=0.0)(click_title_presents), 91 | initial_state=[long_u_emb], 92 | ) 93 | elif type == "con": 94 | short_uemb = layers.GRU( 95 | self.hparams.gru_unit, 96 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 97 | recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed), 98 | bias_initializer=keras.initializers.Zeros(), 99 | )(layers.Masking(mask_value=0.0)(click_title_presents)) 100 | 101 | user_present = layers.Concatenate()([short_uemb, long_u_emb]) 102 | user_present = layers.Dense( 103 | self.hparams.gru_unit, 104 | bias_initializer=keras.initializers.Zeros(), 105 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 106 | )(user_present) 107 | 108 | model = keras.Model( 109 | [his_input_title, user_indexes], user_present, name="user_encoder" 110 | ) 111 | return model 112 | 113 | def _build_newsencoder(self, embedding_layer): 114 | """The main function to create news encoder of LSTUR. 115 | 116 | Args: 117 | embedding_layer (object): a word embedding layer. 118 | 119 | Return: 120 | object: the news encoder of LSTUR. 121 | """ 122 | 123 | sequences_input_title = keras.Input( 124 | shape=(self.hparams.title_size,), dtype="int32" 125 | ) 126 | embedded_sequences_title = embedding_layer(sequences_input_title) 127 | 128 | y = layers.Dropout(self.hparams.dropout)(embedded_sequences_title) 129 | y = layers.Conv1D( 130 | self.hparams.filter_num, 131 | self.hparams.window_size, 132 | activation=self.hparams.cnn_activation, 133 | padding="same", 134 | bias_initializer=keras.initializers.Zeros(), 135 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 136 | )(y) 137 | y = layers.Dropout(self.hparams.dropout)(y) 138 | y = layers.Masking()( 139 | OverwriteMasking()([y, ComputeMasking()(sequences_input_title)]) 140 | ) 141 | pred_title = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y) 142 | print(pred_title) 143 | model = keras.Model(sequences_input_title, pred_title, name="news_encoder") 144 | return model 145 | 146 | def _build_lstur(self): 147 | """The main function to create LSTUR's logic. The core of LSTUR 148 | is a user encoder and a news encoder. 149 | 150 | Returns: 151 | object: a model used to train. 152 | object: a model used to evaluate and inference. 
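        Example (minimal sketch using dummy data, not from the repository; the embedding matrix size is made up):
            >>> import numpy as np
            >>> from ebrec.models.newsrec import LSTURModel
            >>> from ebrec.models.newsrec.model_config import hparams_lstur
            >>> word2vec = np.random.rand(1000, 300)   # (vocab_size, emb_dim), random stand-in for pretrained embeddings
            >>> lstur = LSTURModel(hparams=hparams_lstur, word2vec_embedding=word2vec, seed=42)
            >>> lstur.model.summary()    # trainer: [user_indexes, history, candidates] -> softmax over candidates
            >>> lstur.scorer.summary()   # scorer: single candidate -> sigmoid score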
153 | """ 154 | 155 | his_input_title = keras.Input( 156 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 157 | ) 158 | pred_input_title = keras.Input( 159 | # shape=(hparams.npratio + 1, hparams.title_size), dtype="int32" 160 | shape=(None, self.hparams.title_size), 161 | dtype="int32", 162 | ) 163 | pred_input_title_one = keras.Input( 164 | shape=( 165 | 1, 166 | self.hparams.title_size, 167 | ), 168 | dtype="int32", 169 | ) 170 | pred_title_reshape = layers.Reshape((self.hparams.title_size,))( 171 | pred_input_title_one 172 | ) 173 | user_indexes = keras.Input(shape=(1,), dtype="int32") 174 | 175 | embedding_layer = layers.Embedding( 176 | self.word2vec_embedding.shape[0], 177 | self.word2vec_embedding.shape[1], 178 | weights=[self.word2vec_embedding], 179 | trainable=True, 180 | ) 181 | 182 | titleencoder = self._build_newsencoder(embedding_layer) 183 | self.userencoder = self._build_userencoder(titleencoder, type=self.hparams.type) 184 | self.newsencoder = titleencoder 185 | 186 | user_present = self.userencoder([his_input_title, user_indexes]) 187 | news_present = layers.TimeDistributed(self.newsencoder)(pred_input_title) 188 | news_present_one = self.newsencoder(pred_title_reshape) 189 | 190 | preds = layers.Dot(axes=-1)([news_present, user_present]) 191 | preds = layers.Activation(activation="softmax")(preds) 192 | 193 | pred_one = layers.Dot(axes=-1)([news_present_one, user_present]) 194 | pred_one = layers.Activation(activation="sigmoid")(pred_one) 195 | 196 | model = keras.Model([user_indexes, his_input_title, pred_input_title], preds) 197 | scorer = keras.Model( 198 | [user_indexes, his_input_title, pred_input_title_one], pred_one 199 | ) 200 | 201 | return model, scorer 202 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/model_config.py: -------------------------------------------------------------------------------- 1 | # 2 | DEFAULT_TITLE_SIZE = 30 3 | DEFAULT_BODY_SIZE = 40 4 | UNKNOWN_TITLE_VALUE = [0] * DEFAULT_TITLE_SIZE 5 | UNKNOWN_BODY_VALUE = [0] * DEFAULT_BODY_SIZE 6 | 7 | DEFAULT_DOCUMENT_SIZE = 768 8 | 9 | 10 | def print_hparams(hparams_class): 11 | for attr, value in hparams_class.__annotations__.items(): 12 | # Print attribute names and values 13 | print(f"{attr}: {getattr(hparams_class, attr)}") 14 | 15 | 16 | def hparams_to_dict(hparams_class) -> dict: 17 | params = {} 18 | for attr, value in hparams_class.__annotations__.items(): 19 | params[attr] = getattr(hparams_class, attr) 20 | return params 21 | 22 | 23 | class hparams_naml: 24 | # INPUT DIMENTIONS: 25 | title_size: int = DEFAULT_TITLE_SIZE 26 | history_size: int = 20 27 | body_size: int = DEFAULT_BODY_SIZE 28 | vert_num: int = 100 29 | vert_emb_dim: int = 10 30 | subvert_num: int = 100 31 | subvert_emb_dim: int = 10 32 | # MODEL ARCHITECTURE 33 | dense_activation: str = "relu" 34 | cnn_activation: str = "relu" 35 | attention_hidden_dim: int = 200 36 | filter_num: int = 400 37 | window_size: int = 3 38 | # MODEL OPTIMIZER: 39 | optimizer: str = "adam" 40 | loss: str = "cross_entropy_loss" 41 | dropout: float = 0.2 42 | learning_rate: float = 1e-4 43 | 44 | 45 | class hparams_lstur: 46 | # INPUT DIMENTIONS: 47 | title_size: int = DEFAULT_TITLE_SIZE 48 | history_size: int = 20 49 | n_users: int = 50000 50 | # MODEL ARCHITECTURE 51 | cnn_activation: str = "relu" 52 | type: str = "ini" 53 | attention_hidden_dim: int = 200 54 | gru_unit: int = 400 55 | filter_num: int = 400 56 | window_size: int = 3 57 | # MODEL OPTIMIZER: 
58 | optimizer: str = "adam" 59 | loss: str = "cross_entropy_loss" 60 | dropout: float = 0.2 61 | learning_rate: float = 1e-4 62 | 63 | 64 | class hparams_npa: 65 | # INPUT DIMENSIONS: 66 | title_size: int = DEFAULT_TITLE_SIZE 67 | history_size: int = 20 68 | n_users: int = 50000 69 | # MODEL ARCHITECTURE 70 | cnn_activation: str = "relu" 71 | attention_hidden_dim: int = 200 72 | user_emb_dim: int = 400 73 | filter_num: int = 400 74 | window_size: int = 3 75 | # MODEL OPTIMIZER: 76 | optimizer: str = "adam" 77 | loss: str = "cross_entropy_loss" 78 | dropout: float = 0.2 79 | learning_rate: float = 1e-4 80 | 81 | 82 | class hparams_nrms: 83 | # INPUT DIMENSIONS: 84 | title_size: int = DEFAULT_TITLE_SIZE 85 | history_size: int = 20 86 | # MODEL ARCHITECTURE 87 | head_num: int = 20 88 | head_dim: int = 20 89 | attention_hidden_dim: int = 200 90 | # MODEL OPTIMIZER: 91 | optimizer: str = "adam" 92 | loss: str = "cross_entropy_loss" 93 | dropout: float = 0.2 94 | learning_rate: float = 1e-4 95 | # MY OWN LITTLE TWIST: 96 | newsencoder_units_per_layer: list[int] = None 97 | newsencoder_l2_regularization: float = 1e-4 98 | 99 | 100 | class hparams_nrms_docvec: 101 | # INPUT DIMENSIONS: 102 | title_size: int = DEFAULT_DOCUMENT_SIZE 103 | history_size: int = 20 104 | # MODEL ARCHITECTURE 105 | head_num: int = 16 106 | head_dim: int = 16 107 | attention_hidden_dim: int = 200 108 | # MODEL OPTIMIZER: 109 | optimizer: str = "adam" 110 | loss: str = "cross_entropy_loss" 111 | dropout: float = 0.2 112 | learning_rate: float = 1e-4 113 | newsencoder_units_per_layer: list[int] = [512, 512, 512] 114 | newsencoder_l2_regularization: float = 1e-4 115 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/npa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | from tensorflow.keras import layers 4 | import tensorflow.keras as keras 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | from ebrec.models.newsrec.layers import PersonalizedAttentivePooling 9 | from ebrec.models.newsrec.base_model import BaseModel 10 | 11 | __all__ = ["NPAModel"] 12 | 13 | 14 | class NPAModel(BaseModel): 15 | """NPA model (Neural News Recommendation with Personalized Attention) 16 | 17 | Attributes: 18 | word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix. 19 | hparam (object): Global hyper-parameters. 20 | """ 21 | 22 | def __init__( 23 | self, 24 | hparams, 25 | word2vec_embedding=None, 26 | seed=None, 27 | **kwargs, 28 | ): 29 | """Initialization steps for NPA. 30 | Compared with the BaseModel, NPA needs word embeddings. 31 | After creating the word embedding matrix, BaseModel's __init__ method will be called. 32 | 33 | Args: 34 | hparams (object): Global hyper-parameters. Some key settings such as filter_num are there. 35 | """ 36 | 37 | super().__init__( 38 | hparams=hparams, 39 | word2vec_embedding=word2vec_embedding, 40 | seed=seed, 41 | **kwargs, 42 | ) 43 | 44 | def _get_input_label_from_iter(self, batch_data): 45 | input_feat = [ 46 | batch_data["user_index_batch"], 47 | batch_data["clicked_title_batch"], 48 | batch_data["candidate_title_batch"], 49 | ] 50 | input_label = batch_data["labels"] 51 | return input_feat, input_label 52 | 53 | def _build_graph(self): 54 | """Build NPA model and scorer. 55 | 56 | Returns: 57 | object: a model used to train. 58 | object: a model used to evaluate and inference.
59 | """ 60 | 61 | model, scorer = self._build_npa() 62 | return model, scorer 63 | 64 | def _build_userencoder(self, titleencoder, user_embedding_layer): 65 | """The main function to create user encoder of NPA. 66 | 67 | Args: 68 | titleencoder (object): the news encoder of NPA. 69 | 70 | Return: 71 | object: the user encoder of NPA. 72 | """ 73 | 74 | his_input_title = keras.Input( 75 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 76 | ) 77 | user_indexes = keras.Input(shape=(1,), dtype="int32") 78 | 79 | nuser_id = layers.Reshape((1, 1))(user_indexes) 80 | repeat_uids = layers.Concatenate(axis=-2)( 81 | [nuser_id] * self.hparams.history_size 82 | ) 83 | his_title_uid = layers.Concatenate(axis=-1)([his_input_title, repeat_uids]) 84 | 85 | click_title_presents = layers.TimeDistributed(titleencoder)(his_title_uid) 86 | 87 | u_emb = layers.Reshape((self.hparams.user_emb_dim,))( 88 | user_embedding_layer(user_indexes) 89 | ) 90 | user_present = PersonalizedAttentivePooling( 91 | self.hparams.history_size, 92 | self.hparams.filter_num, 93 | self.hparams.attention_hidden_dim, 94 | seed=self.seed, 95 | )( 96 | [ 97 | click_title_presents, 98 | layers.Dense(self.hparams.attention_hidden_dim)(u_emb), 99 | ] 100 | ) 101 | 102 | model = keras.Model( 103 | [his_input_title, user_indexes], user_present, name="user_encoder" 104 | ) 105 | return model 106 | 107 | def _build_newsencoder(self, embedding_layer, user_embedding_layer): 108 | """The main function to create news encoder of NPA. 109 | 110 | Args: 111 | embedding_layer (object): a word embedding layer. 112 | 113 | Return: 114 | object: the news encoder of NPA. 115 | """ 116 | 117 | sequence_title_uindex = keras.Input( 118 | shape=(self.hparams.title_size + 1,), dtype="int32" 119 | ) 120 | 121 | sequences_input_title = layers.Lambda( 122 | lambda x: x[:, : self.hparams.title_size] 123 | )(sequence_title_uindex) 124 | user_index = layers.Lambda(lambda x: x[:, self.hparams.title_size :])( 125 | sequence_title_uindex 126 | ) 127 | 128 | u_emb = layers.Reshape((self.hparams.user_emb_dim,))( 129 | user_embedding_layer(user_index) 130 | ) 131 | embedded_sequences_title = embedding_layer(sequences_input_title) 132 | 133 | y = layers.Dropout(self.hparams.dropout)(embedded_sequences_title) 134 | y = layers.Conv1D( 135 | self.hparams.filter_num, 136 | self.hparams.window_size, 137 | activation=self.hparams.cnn_activation, 138 | padding="same", 139 | bias_initializer=keras.initializers.Zeros(), 140 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 141 | )(y) 142 | y = layers.Dropout(self.hparams.dropout)(y) 143 | 144 | pred_title = PersonalizedAttentivePooling( 145 | self.hparams.title_size, 146 | self.hparams.filter_num, 147 | self.hparams.attention_hidden_dim, 148 | seed=self.seed, 149 | )([y, layers.Dense(self.hparams.attention_hidden_dim)(u_emb)]) 150 | 151 | # pred_title = Reshape((1, feature_size))(pred_title) 152 | model = keras.Model(sequence_title_uindex, pred_title, name="news_encoder") 153 | return model 154 | 155 | def _build_npa(self): 156 | """The main function to create NPA's logic. The core of NPA 157 | is a user encoder and a news encoder. 158 | 159 | Returns: 160 | object: a model used to train. 161 | object: a model used to evaluate and predict. 
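        Example (editor's sketch with made-up sizes; assumes `model` is the trainer returned below, built with hparams_npa, and that `vocab_size` matches the word-embedding matrix):
            >>> import numpy as np
            >>> user_ids = np.random.randint(1, hparams_npa.n_users, size=(2, 1))
            >>> history = np.random.randint(0, vocab_size, size=(2, hparams_npa.history_size, hparams_npa.title_size))
            >>> candidates = np.random.randint(0, vocab_size, size=(2, 5, hparams_npa.title_size))
            >>> scores = model.predict([user_ids, history, candidates])   # (2, 5): softmax scores over the 5 candidates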
162 | """ 163 | 164 | his_input_title = keras.Input( 165 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 166 | ) 167 | pred_input_title = keras.Input( 168 | # shape=(hparams.npratio + 1, hparams.title_size), dtype="int32" 169 | shape=(None, self.hparams.title_size), 170 | dtype="int32", 171 | ) 172 | pred_input_title_one = keras.Input( 173 | shape=( 174 | 1, 175 | self.hparams.title_size, 176 | ), 177 | dtype="int32", 178 | ) 179 | pred_title_one_reshape = layers.Reshape((self.hparams.title_size,))( 180 | pred_input_title_one 181 | ) 182 | 183 | user_indexes = keras.Input(shape=(1,), dtype="int32") 184 | 185 | nuser_index = layers.Reshape((1, 1))(user_indexes) 186 | 187 | # Calculate npratio + 1 based on the dynamic shape of pred_input_title 188 | npratio_plus_one = tf.shape(pred_input_title)[1] 189 | 190 | repeat_uindex = tf.tile(nuser_index, [1, npratio_plus_one, 1]) 191 | 192 | pred_title_uindex = layers.Concatenate(axis=-1)( 193 | [pred_input_title, repeat_uindex] 194 | ) 195 | pred_title_uindex_one = layers.Concatenate()( 196 | [pred_title_one_reshape, user_indexes] 197 | ) 198 | 199 | embedding_layer = layers.Embedding( 200 | self.word2vec_embedding.shape[0], 201 | self.word2vec_embedding.shape[1], 202 | weights=[self.word2vec_embedding], 203 | trainable=True, 204 | ) 205 | 206 | user_embedding_layer = layers.Embedding( 207 | input_dim=self.hparams.n_users + 1, 208 | output_dim=self.hparams.user_emb_dim, 209 | trainable=True, 210 | embeddings_initializer="zeros", 211 | ) 212 | 213 | titleencoder = self._build_newsencoder(embedding_layer, user_embedding_layer) 214 | userencoder = self._build_userencoder(titleencoder, user_embedding_layer) 215 | newsencoder = titleencoder 216 | 217 | user_present = userencoder([his_input_title, user_indexes]) 218 | 219 | news_present = layers.TimeDistributed(newsencoder)(pred_title_uindex) 220 | news_present_one = newsencoder(pred_title_uindex_one) 221 | 222 | preds = layers.Dot(axes=-1)([news_present, user_present]) 223 | preds = layers.Activation(activation="softmax")(preds) 224 | 225 | pred_one = layers.Dot(axes=-1)([news_present_one, user_present]) 226 | pred_one = layers.Activation(activation="sigmoid")(pred_one) 227 | 228 | model = keras.Model([user_indexes, his_input_title, pred_input_title], preds) 229 | scorer = keras.Model( 230 | [user_indexes, his_input_title, pred_input_title_one], pred_one 231 | ) 232 | 233 | return model, scorer 234 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/nrms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | from ebrec.models.newsrec.layers import AttLayer2, SelfAttention 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from tensorflow.keras.layers import Embedding, Input, Dropout, Dense, BatchNormalization 8 | from tensorflow.keras.initializers import GlorotUniform 9 | from tensorflow.keras.regularizers import l2 10 | 11 | 12 | class NRMSModel: 13 | """NRMS model(Neural News Recommendation with Multi-Head Self-Attention) 14 | 15 | Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang,and Xing Xie, "Neural News 16 | Recommendation with Multi-Head Self-Attention" in Proceedings of the 2019 Conference 17 | on Empirical Methods in Natural Language Processing and the 9th International Joint Conference 18 | on Natural Language Processing (EMNLP-IJCNLP) 19 | 20 | Attributes: 21 | """ 22 | 23 | def __init__( 24 | self, 25 | hparams: dict, 26 | word2vec_embedding: np.ndarray = None, 27 | word_emb_dim: int = 300, 28 | vocab_size: int = 32000, 29 | seed: int = None, 30 | ): 31 | """Initialization steps for NRMS.""" 32 | self.hparams = hparams 33 | self.seed = seed 34 | 35 | # SET SEED: 36 | tf.random.set_seed(seed) 37 | np.random.seed(seed) 38 | 39 | # INIT THE WORD-EMBEDDINGS: 40 | if word2vec_embedding is None: 41 | # Xavier Initialization 42 | initializer = GlorotUniform(seed=self.seed) 43 | self.word2vec_embedding = initializer(shape=(vocab_size, word_emb_dim)) 44 | # self.word2vec_embedding = np.random.rand(vocab_size, word_emb_dim) 45 | else: 46 | self.word2vec_embedding = word2vec_embedding 47 | 48 | # BUILD AND COMPILE MODEL: 49 | self.model, self.scorer = self._build_graph() 50 | data_loss = self._get_loss(self.hparams.loss) 51 | train_optimizer = self._get_opt( 52 | optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate 53 | ) 54 | self.model.compile(loss=data_loss, optimizer=train_optimizer) 55 | 56 | def _get_loss(self, loss: str): 57 | """Make loss function, consists of data loss and regularization loss 58 | Returns: 59 | object: Loss function or loss function name 60 | """ 61 | if loss == "cross_entropy_loss": 62 | data_loss = "categorical_crossentropy" 63 | elif loss == "log_loss": 64 | data_loss = "binary_crossentropy" 65 | else: 66 | raise ValueError(f"this loss not defined {loss}") 67 | return data_loss 68 | 69 | def _get_opt(self, optimizer: str, lr: float): 70 | """Get the optimizer according to configuration. Usually we will use Adam. 71 | Returns: 72 | object: An optimizer. 73 | """ 74 | # TODO: shouldn't be a string input you should just set the optimizer, to avoid stuff like this: 75 | # => 'WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.' 76 | if optimizer == "adam": 77 | train_opt = tf.keras.optimizers.Adam(learning_rate=lr) 78 | else: 79 | raise ValueError(f"this optimizer not defined {optimizer}") 80 | return train_opt 81 | 82 | def _build_graph(self): 83 | """Build NRMS model and scorer. 84 | 85 | Returns: 86 | object: a model used to train. 87 | object: a model used to evaluate and inference. 88 | """ 89 | model, scorer = self._build_nrms() 90 | return model, scorer 91 | 92 | def _build_userencoder(self, titleencoder): 93 | """The main function to create user encoder of NRMS. 94 | 95 | Args: 96 | titleencoder (object): the news encoder of NRMS. 97 | 98 | Return: 99 | object: the user encoder of NRMS. 
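        Shape sketch (editor's note, not from the original source; dimensions follow the hparams used at build time):
            >>> # input:  his_input_title -> (batch_size, history_size, title_size), int32 token ids
            >>> # output: user_present    -> (batch_size, head_num * head_dim), the user representation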
100 | """ 101 | his_input_title = tf.keras.Input( 102 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 103 | ) 104 | 105 | click_title_presents = tf.keras.layers.TimeDistributed(titleencoder)( 106 | his_input_title 107 | ) 108 | y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)( 109 | [click_title_presents] * 3 110 | ) 111 | user_present = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y) 112 | 113 | model = tf.keras.Model(his_input_title, user_present, name="user_encoder") 114 | return model 115 | 116 | def _build_newsencoder(self, units_per_layer: list[int] = None): 117 | """The main function to create news encoder of NRMS. 118 | 119 | Args: 120 | embedding_layer (object): a word embedding layer. 121 | 122 | Return: 123 | object: the news encoder of NRMS. 124 | """ 125 | embedding_layer = tf.keras.layers.Embedding( 126 | self.word2vec_embedding.shape[0], 127 | self.word2vec_embedding.shape[1], 128 | weights=[self.word2vec_embedding], 129 | trainable=True, 130 | ) 131 | sequences_input_title = tf.keras.Input( 132 | shape=(self.hparams.title_size,), dtype="int32" 133 | ) 134 | embedded_sequences_title = embedding_layer(sequences_input_title) 135 | 136 | y = tf.keras.layers.Dropout(self.hparams.dropout)(embedded_sequences_title) 137 | y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)( 138 | [y, y, y] 139 | ) 140 | 141 | # Create configurable Dense layers (the if - else is something I've added): 142 | if units_per_layer: 143 | for layer in units_per_layer: 144 | y = tf.keras.layers.Dense( 145 | units=layer, 146 | activation="relu", 147 | kernel_regularizer=tf.keras.regularizers.l2( 148 | self.hparams.newsencoder_l2_regularization 149 | ), 150 | )(y) 151 | y = tf.keras.layers.BatchNormalization()(y) 152 | y = tf.keras.layers.Dropout(self.hparams.dropout)(y) 153 | else: 154 | y = tf.keras.layers.Dropout(self.hparams.dropout)(y) 155 | 156 | pred_title = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y) 157 | 158 | model = tf.keras.Model(sequences_input_title, pred_title, name="news_encoder") 159 | return model 160 | 161 | def _build_nrms(self): 162 | """The main function to create NRMS's logic. The core of NRMS 163 | is a user encoder and a news encoder. 164 | 165 | Returns: 166 | object: a model used to train. 167 | object: a model used to evaluate and inference. 
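        Example (minimal dummy-data sketch, not from the repository; vocabulary size, batch size and candidate count are made up):
            >>> import numpy as np
            >>> from ebrec.models.newsrec import NRMSModel
            >>> from ebrec.models.newsrec.model_config import hparams_nrms
            >>> nrms = NRMSModel(hparams=hparams_nrms, word2vec_embedding=np.random.rand(1000, 300), seed=42)
            >>> history = np.random.randint(0, 1000, size=(2, hparams_nrms.history_size, hparams_nrms.title_size))
            >>> candidates = np.random.randint(0, 1000, size=(2, 5, hparams_nrms.title_size))
            >>> nrms.model.predict([history, candidates]).shape        # (2, 5): softmax scores over the candidates
            >>> nrms.scorer.predict([history, candidates[:, :1, :]]).shape   # (2, 1): sigmoid score for a single candidate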
168 | """ 169 | 170 | his_input_title = tf.keras.Input( 171 | shape=(self.hparams.history_size, self.hparams.title_size), 172 | dtype="int32", 173 | ) 174 | pred_input_title = tf.keras.Input( 175 | # shape = (hparams.npratio + 1, hparams.title_size) 176 | shape=(None, self.hparams.title_size), 177 | dtype="int32", 178 | ) 179 | pred_input_title_one = tf.keras.Input( 180 | shape=( 181 | 1, 182 | self.hparams.title_size, 183 | ), 184 | dtype="int32", 185 | ) 186 | pred_title_one_reshape = tf.keras.layers.Reshape((self.hparams.title_size,))( 187 | pred_input_title_one 188 | ) 189 | titleencoder = self._build_newsencoder( 190 | units_per_layer=self.hparams.newsencoder_units_per_layer 191 | ) 192 | self.userencoder = self._build_userencoder(titleencoder) 193 | self.newsencoder = titleencoder 194 | 195 | user_present = self.userencoder(his_input_title) 196 | news_present = tf.keras.layers.TimeDistributed(self.newsencoder)( 197 | pred_input_title 198 | ) 199 | news_present_one = self.newsencoder(pred_title_one_reshape) 200 | 201 | preds = tf.keras.layers.Dot(axes=-1)([news_present, user_present]) 202 | preds = tf.keras.layers.Activation(activation="softmax")(preds) 203 | 204 | pred_one = tf.keras.layers.Dot(axes=-1)([news_present_one, user_present]) 205 | pred_one = tf.keras.layers.Activation(activation="sigmoid")(pred_one) 206 | 207 | model = tf.keras.Model([his_input_title, pred_input_title], preds) 208 | scorer = tf.keras.Model([his_input_title, pred_input_title_one], pred_one) 209 | 210 | return model, scorer 211 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/nrms_docvec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | from ebrec.models.newsrec.layers import AttLayer2, SelfAttention 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | class NRMSDocVec: 9 | """ 10 | Modified NRMS model (Neural News Recommendation with Multi-Head Self-Attention) 11 | - Initiated with article-embeddings. 
12 | 13 | Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang,and Xing Xie, "Neural News 14 | Recommendation with Multi-Head Self-Attention" in Proceedings of the 2019 Conference 15 | on Empirical Methods in Natural Language Processing and the 9th International Joint Conference 16 | on Natural Language Processing (EMNLP-IJCNLP) 17 | 18 | Attributes: 19 | """ 20 | 21 | def __init__( 22 | self, 23 | hparams: dict, 24 | seed: int = None, 25 | ): 26 | """Initialization steps for NRMS.""" 27 | self.hparams = hparams 28 | self.seed = seed 29 | 30 | # SET SEED: 31 | tf.random.set_seed(seed) 32 | np.random.seed(seed) 33 | # BUILD AND COMPILE MODEL: 34 | self.model, self.scorer = self._build_graph() 35 | data_loss = self._get_loss(self.hparams.loss) 36 | train_optimizer = self._get_opt( 37 | optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate 38 | ) 39 | self.model.compile(loss=data_loss, optimizer=train_optimizer) 40 | 41 | def _get_loss(self, loss: str): 42 | """Make loss function, consists of data loss and regularization loss 43 | Returns: 44 | object: Loss function or loss function name 45 | """ 46 | if loss == "cross_entropy_loss": 47 | data_loss = "categorical_crossentropy" 48 | elif loss == "log_loss": 49 | data_loss = "binary_crossentropy" 50 | else: 51 | raise ValueError(f"this loss not defined {loss}") 52 | return data_loss 53 | 54 | def _get_opt(self, optimizer: str, lr: float): 55 | """Get the optimizer according to configuration. Usually we will use Adam. 56 | Returns: 57 | object: An optimizer. 58 | """ 59 | if optimizer == "adam": 60 | train_opt = tf.keras.optimizers.Adam(learning_rate=lr) 61 | else: 62 | raise ValueError(f"this optimizer not defined {optimizer}") 63 | return train_opt 64 | 65 | def _build_graph(self): 66 | """Build NRMS model and scorer. 67 | 68 | Returns: 69 | object: a model used to train. 70 | object: a model used to evaluate and inference. 71 | """ 72 | model, scorer = self._build_nrms() 73 | return model, scorer 74 | 75 | def _build_userencoder(self, titleencoder): 76 | """The main function to create user encoder of NRMS. 77 | 78 | Args: 79 | titleencoder (object): the news encoder of NRMS. 80 | 81 | Return: 82 | object: the user encoder of NRMS. 83 | """ 84 | his_input_title = tf.keras.Input( 85 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="float32" 86 | ) 87 | 88 | click_title_presents = tf.keras.layers.TimeDistributed(titleencoder)( 89 | his_input_title 90 | ) 91 | y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)( 92 | [click_title_presents] * 3 93 | ) 94 | user_present = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y) 95 | 96 | model = tf.keras.Model(his_input_title, user_present, name="user_encoder") 97 | return model 98 | 99 | def _build_newsencoder(self, units_per_layer: list[int] = list[512, 512, 512]): 100 | """THIS IS OUR IMPLEMENTATION. 101 | The main function to create a news encoder. 102 | 103 | Parameters: 104 | units_per_layer (int): The number of neurons in each Dense layer. 105 | 106 | Return: 107 | object: the news encoder. 
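        Example (editor's illustration; with the default hparams_nrms_docvec this maps a 768-d document vector to a head_num * head_dim = 256-d article representation):
            >>> # input:  (batch_size, 768)  float32 document embedding
            >>> # output: (batch_size, 256)  article representation fed to the user encoder
            >>> news_encoder = self._build_newsencoder(units_per_layer=[512, 512, 512])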
108 | """ 109 | DOCUMENT_VECTOR_DIM = self.hparams.title_size 110 | OUTPUT_DIM = self.hparams.head_num * self.hparams.head_dim 111 | 112 | # DENSE LAYERS (FINE-TUNED): 113 | sequences_input_title = tf.keras.Input( 114 | shape=(DOCUMENT_VECTOR_DIM), dtype="float32" 115 | ) 116 | x = sequences_input_title 117 | # Create configurable Dense layers: 118 | for layer in units_per_layer: 119 | x = tf.keras.layers.Dense( 120 | units=layer, 121 | activation="relu", 122 | kernel_regularizer=tf.keras.regularizers.l2( 123 | self.hparams.newsencoder_l2_regularization 124 | ), 125 | )(x) 126 | x = tf.keras.layers.BatchNormalization()(x) 127 | x = tf.keras.layers.Dropout(self.hparams.dropout)(x) 128 | 129 | # OUTPUT: 130 | pred_title = tf.keras.layers.Dense(units=OUTPUT_DIM, activation="relu")(x) 131 | 132 | # Construct the final model 133 | model = tf.keras.Model( 134 | inputs=sequences_input_title, outputs=pred_title, name="news_encoder" 135 | ) 136 | 137 | return model 138 | 139 | def _build_nrms(self): 140 | """The main function to create NRMS's logic. The core of NRMS 141 | is a user encoder and a news encoder. 142 | 143 | Returns: 144 | object: a model used to train. 145 | object: a model used to evaluate and inference. 146 | """ 147 | 148 | his_input_title = tf.keras.Input( 149 | shape=(self.hparams.history_size, self.hparams.title_size), 150 | dtype="float32", 151 | ) 152 | pred_input_title = tf.keras.Input( 153 | # shape = (hparams.npratio + 1, hparams.title_size) 154 | shape=(None, self.hparams.title_size), 155 | dtype="float32", 156 | ) 157 | pred_input_title_one = tf.keras.Input( 158 | shape=( 159 | 1, 160 | self.hparams.title_size, 161 | ), 162 | dtype="float32", 163 | ) 164 | pred_title_one_reshape = tf.keras.layers.Reshape((self.hparams.title_size,))( 165 | pred_input_title_one 166 | ) 167 | titleencoder = self._build_newsencoder( 168 | units_per_layer=self.hparams.newsencoder_units_per_layer 169 | ) 170 | self.userencoder = self._build_userencoder(titleencoder) 171 | self.newsencoder = titleencoder 172 | 173 | user_present = self.userencoder(his_input_title) 174 | news_present = tf.keras.layers.TimeDistributed(self.newsencoder)( 175 | pred_input_title 176 | ) 177 | news_present_one = self.newsencoder(pred_title_one_reshape) 178 | 179 | preds = tf.keras.layers.Dot(axes=-1)([news_present, user_present]) 180 | preds = tf.keras.layers.Activation(activation="softmax")(preds) 181 | 182 | pred_one = tf.keras.layers.Dot(axes=-1)([news_present_one, user_present]) 183 | pred_one = tf.keras.layers.Activation(activation="sigmoid")(pred_one) 184 | 185 | model = tf.keras.Model([his_input_title, pred_input_title], preds) 186 | scorer = tf.keras.Model([his_input_title, pred_input_title_one], pred_one) 187 | 188 | return model, scorer 189 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/utils.py: -------------------------------------------------------------------------------- 1 | class set_args: 2 | def __init__(self, args_dict): 3 | _ = [setattr(set_args, key, val) for key, val in args_dict.items()] 4 | 5 | 6 | def print_n_parameters(model) -> None: 7 | num_params = model.count_params() 8 | print("Number of parameters:", num_params) 9 | 10 | 11 | def print_parameter_device(model) -> None: 12 | for variable in model.variables: 13 | print(f"Variable name: {variable.name}, Device: {variable.device}") 14 | -------------------------------------------------------------------------------- /src/ebrec/utils/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/src/ebrec/utils/__init__.py -------------------------------------------------------------------------------- /src/ebrec/utils/_articles.py: -------------------------------------------------------------------------------- 1 | from ebrec.utils._python import create_lookup_dict 2 | import polars as pl 3 | from ebrec.utils._constants import DEFAULT_ARTICLE_ID_COL 4 | 5 | try: 6 | from transformers import AutoTokenizer 7 | except ImportError: 8 | print("transformers not available") 9 | 10 | 11 | def load_article_id_embeddings( 12 | df: pl.DataFrame, path: str, item_col: str = DEFAULT_ARTICLE_ID_COL 13 | ) -> pl.DataFrame: 14 | """Load embeddings artifacts and join to articles on 'article_id' 15 | Args: 16 | path (str): Path to document embeddings 17 | """ 18 | return df.join(pl.read_parquet(path), on=item_col, how="left") 19 | 20 | 21 | def create_article_id_to_value_mapping( 22 | df: pl.DataFrame, 23 | value_col: str, 24 | article_col: str = DEFAULT_ARTICLE_ID_COL, 25 | ): 26 | return create_lookup_dict( 27 | df.select(article_col, value_col), key=article_col, value=value_col 28 | ) 29 | 30 | 31 | def convert_text2encoding_with_transformers( 32 | df: pl.DataFrame, 33 | tokenizer: AutoTokenizer, 34 | column: str, 35 | max_length: int = None, 36 | ) -> pl.DataFrame: 37 | """Converts text in a specified DataFrame column to tokens using a provided tokenizer. 38 | Args: 39 | df (pl.DataFrame): The input DataFrame containing the text column. 40 | tokenizer (AutoTokenizer): The tokenizer to use for encoding the text. (from transformers import AutoTokenizer) 41 | column (str): The name of the column containing the text. 42 | max_length (int, optional): The maximum length of the encoded tokens. Defaults to None. 43 | Returns: 44 | pl.DataFrame: A new DataFrame with an additional column containing the encoded tokens. 45 | Example: 46 | >>> from transformers import AutoTokenizer 47 | >>> import polars as pl 48 | >>> df = pl.DataFrame({ 49 | 'text': ['This is a test.', 'Another test string.', 'Yet another one.'] 50 | }) 51 | >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') 52 | >>> encoded_df, new_column = convert_text2encoding_with_transformers(df, tokenizer, 'text', max_length=20) 53 | >>> print(encoded_df) 54 | shape: (3, 2) 55 | ┌──────────────────────┬───────────────────────────────┐ 56 | │ text ┆ text_encode_bert-base-uncased │ 57 | │ --- ┆ --- │ 58 | │ str ┆ list[i64] │ 59 | ╞══════════════════════╪═══════════════════════════════╡ 60 | │ This is a test. ┆ [2023, 2003, … 0] │ 61 | │ Another test string. ┆ [2178, 3231, … 0] │ 62 | │ Yet another one. 
┆ [2664, 2178, … 0] │ 63 | └──────────────────────┴───────────────────────────────┘ 64 | >>> print(new_column) 65 | text_encode_bert-base-uncased 66 | """ 67 | text = df[column].to_list() 68 | # set columns 69 | new_column = f"{column}_encode_{tokenizer.name_or_path}" 70 | # If 'max_length' is provided then set it, else encode each string its original length 71 | padding = "max_length" if max_length else False 72 | encoded_tokens = tokenizer( 73 | text, 74 | add_special_tokens=False, 75 | padding=padding, 76 | max_length=max_length, 77 | truncation=True, 78 | )["input_ids"] 79 | return df.with_columns(pl.Series(new_column, encoded_tokens)), new_column 80 | 81 | 82 | def create_sort_based_prediction_score( 83 | df: pl.DataFrame, 84 | column: str, 85 | desc: bool, 86 | article_id_col: str = DEFAULT_ARTICLE_ID_COL, 87 | prediction_score_col: str = "prediction_score", 88 | ) -> pl.DataFrame: 89 | """ 90 | Generates a prediction score for each row in a Polars DataFrame based on the sorting of a specified column. 91 | 92 | Args: 93 | df (pl.DataFrame): The input DataFrame to process. 94 | column (str): The name of the column to sort by and to base the prediction scores on. 95 | desc (bool): Determines the sorting order. If True, sort in descending order; otherwise, in ascending order. 96 | article_id_col (str, optional): The name article ID column. Defaults to "article_id". 97 | prediction_score_col (str, optional): The name to assign to the prediction score column. Defaults to "prediction_score". 98 | 99 | Returns: 100 | pl.DataFrame: A Polars DataFrame including the original data along with the new prediction score column. 101 | 102 | Examples: 103 | >>> import polars as pl 104 | >>> df = pl.DataFrame({ 105 | "article_id": [1, 2, 3, 4, 5], 106 | "views": [100, 150, 200, 50, 300], 107 | }) 108 | >>> create_sort_based_prediction_score(df, "views", True) 109 | shape: (5, 3) 110 | ┌────────────┬───────┬──────────────────┐ 111 | │ article_id ┆ views ┆ prediction_score │ 112 | │ --- ┆ --- ┆ --- │ 113 | │ i64 ┆ i64 ┆ f64 │ 114 | ╞════════════╪═══════╪══════════════════╡ 115 | │ 5 ┆ 300 ┆ 1.0 │ 116 | │ 3 ┆ 200 ┆ 0.5 │ 117 | │ 2 ┆ 150 ┆ 0.333333 │ 118 | │ 1 ┆ 100 ┆ 0.25 │ 119 | │ 4 ┆ 50 ┆ 0.2 │ 120 | └────────────┴───────┴──────────────────┘ 121 | """ 122 | _TEMP_NAME = "index" 123 | return ( 124 | ( 125 | df.select(article_id_col, column) 126 | .sort(by=column, descending=desc) 127 | .with_row_index(name=_TEMP_NAME, offset=1) 128 | ) 129 | .with_columns((1 / pl.col(_TEMP_NAME)).alias(prediction_score_col)) 130 | .drop(_TEMP_NAME) 131 | ) 132 | -------------------------------------------------------------------------------- /src/ebrec/utils/_articles_behaviors.py: -------------------------------------------------------------------------------- 1 | from ebrec.utils._python import generate_unique_name 2 | 3 | try: 4 | import polars as pl 5 | except ImportError: 6 | print("polars not available") 7 | 8 | 9 | def map_list_article_id_to_value( 10 | behaviors: pl.DataFrame, 11 | behaviors_column: str, 12 | mapping: dict[int, pl.Series], 13 | drop_nulls: bool = False, 14 | fill_nulls: any = None, 15 | ) -> pl.DataFrame: 16 | """ 17 | 18 | Maps the values of a column in a DataFrame `behaviors` containing article IDs to their corresponding values 19 | in a column in another DataFrame `articles`. The mapping is performed using a dictionary constructed from 20 | the two DataFrames. The resulting DataFrame has the same columns as `behaviors`, but with the article IDs 21 | replaced by their corresponding values. 
22 | 23 | Args: 24 | behaviors (pl.DataFrame): The DataFrame containing the column to be mapped. 25 | behaviors_column (str): The name of the column to be mapped in `behaviors`. 26 | mapping (dict[int, pl.Series]): A dictionary with article IDs as keys and corresponding values as values. 27 | Note, 'replace' works a lot faster when values are of type pl.Series! 28 | drop_nulls (bool): If `True`, any rows in the resulting DataFrame with null values will be dropped. 29 | If `False` and `fill_nulls` is specified, null values in `behaviors_column` will be replaced with `fill_null`. 30 | fill_nulls (Optional[any]): If specified, any null values in `behaviors_column` will be replaced with this value. 31 | 32 | Returns: 33 | pl.DataFrame: A new DataFrame with the same columns as `behaviors`, but with the article IDs in 34 | `behaviors_column` replaced by their corresponding values in `mapping`. 35 | 36 | Example: 37 | >>> behaviors = pl.DataFrame( 38 | {"user_id": [1, 2, 3, 4, 5], "article_ids": [["A1", "A2"], ["A2", "A3"], ["A1", "A4"], ["A4", "A4"], None]} 39 | ) 40 | >>> articles = pl.DataFrame( 41 | { 42 | "article_id": ["A1", "A2", "A3"], 43 | "article_type": ["News", "Sports", "Entertainment"], 44 | } 45 | ) 46 | >>> articles_dict = dict(zip(articles["article_id"], articles["article_type"])) 47 | >>> map_list_article_id_to_value( 48 | behaviors=behaviors, 49 | behaviors_column="article_ids", 50 | mapping=articles_dict, 51 | fill_nulls="Unknown", 52 | ) 53 | shape: (4, 2) 54 | ┌─────────┬─────────────────────────────┐ 55 | │ user_id ┆ article_ids │ 56 | │ --- ┆ --- │ 57 | │ i64 ┆ list[str] │ 58 | ╞═════════╪═════════════════════════════╡ 59 | │ 1 ┆ ["News", "Sports"] │ 60 | │ 2 ┆ ["Sports", "Entertainment"] │ 61 | │ 3 ┆ ["News", "Unknown"] │ 62 | │ 4 ┆ ["Unknown", "Unknown"] │ 63 | │ 5 ┆ ["Unknown"] │ 64 | └─────────┴─────────────────────────────┘ 65 | >>> map_list_article_id_to_value( 66 | behaviors=behaviors, 67 | behaviors_column="article_ids", 68 | mapping=articles_dict, 69 | drop_nulls=True, 70 | ) 71 | shape: (4, 2) 72 | ┌─────────┬─────────────────────────────┐ 73 | │ user_id ┆ article_ids │ 74 | │ --- ┆ --- │ 75 | │ i64 ┆ list[str] │ 76 | ╞═════════╪═════════════════════════════╡ 77 | │ 1 ┆ ["News", "Sports"] │ 78 | │ 2 ┆ ["Sports", "Entertainment"] │ 79 | │ 3 ┆ ["News"] │ 80 | │ 4 ┆ null │ 81 | │ 5 ┆ null │ 82 | └─────────┴─────────────────────────────┘ 83 | >>> map_list_article_id_to_value( 84 | behaviors=behaviors, 85 | behaviors_column="article_ids", 86 | mapping=articles_dict, 87 | drop_nulls=False, 88 | ) 89 | shape: (4, 2) 90 | ┌─────────┬─────────────────────────────┐ 91 | │ user_id ┆ article_ids │ 92 | │ --- ┆ --- │ 93 | │ i64 ┆ list[str] │ 94 | ╞═════════╪═════════════════════════════╡ 95 | │ 1 ┆ ["News", "Sports"] │ 96 | │ 2 ┆ ["Sports", "Entertainment"] │ 97 | │ 3 ┆ ["News", null] │ 98 | │ 4 ┆ [null, null] │ 99 | │ 5 ┆ [null] │ 100 | └─────────┴─────────────────────────────┘ 101 | """ 102 | GROUPBY_ID = generate_unique_name(behaviors.columns, "_groupby_id") 103 | behaviors = behaviors.lazy().with_row_index(GROUPBY_ID) 104 | # => 105 | select_column = ( 106 | behaviors.select(pl.col(GROUPBY_ID), pl.col(behaviors_column)) 107 | .explode(behaviors_column) 108 | .with_columns(pl.col(behaviors_column).replace(mapping, default=None)) 109 | .collect() 110 | ) 111 | # => 112 | if drop_nulls: 113 | select_column = select_column.drop_nulls() 114 | elif fill_nulls is not None: 115 | select_column = select_column.with_columns( 116 | pl.col(behaviors_column).fill_null(fill_nulls) 117 
| ) 118 | # => 119 | select_column = ( 120 | select_column.lazy().group_by(GROUPBY_ID).agg(behaviors_column).collect() 121 | ) 122 | return ( 123 | behaviors.drop(behaviors_column) 124 | .collect() 125 | .join(select_column, on=GROUPBY_ID, how="left") 126 | .drop(GROUPBY_ID) 127 | ) 128 | -------------------------------------------------------------------------------- /src/ebrec/utils/_constants.py: -------------------------------------------------------------------------------- 1 | # BEHAVIORS 2 | DEFAULT_IMPRESSION_TIMESTAMP_COL = "impression_time" 3 | DEFAULT_IS_BEYOND_ACCURACY_COL = "is_beyond_accuracy" 4 | DEFAULT_CLICKED_ARTICLES_COL = "article_ids_clicked" 5 | DEFAULT_SCROLL_PERCENTAGE_COL = "scroll_percentage" 6 | DEFAULT_INVIEW_ARTICLES_COL = "article_ids_inview" 7 | DEFAULT_IMPRESSION_ID_COL = "impression_id" 8 | DEFAULT_IS_SUBSCRIBER_COL = "is_subscriber" 9 | DEFAULT_IS_SSO_USER_COL = "is_sso_user" 10 | DEFAULT_ARTICLE_ID_COL = "article_id" 11 | DEFAULT_SESSION_ID_COL = "session_id" 12 | DEFAULT_READ_TIME_COL = "read_time" 13 | DEFAULT_DEVICE_COL = "device_type" 14 | DEFAULT_POSTCODE_COL = "postcode" 15 | DEFAULT_GENDER_COL = "gender" 16 | DEFAULT_USER_COL = "user_id" 17 | DEFAULT_AGE_COL = "age" 18 | 19 | DEFAULT_NEXT_SCROLL_PERCENTAGE_COL = f"next_{DEFAULT_SCROLL_PERCENTAGE_COL}" 20 | DEFAULT_NEXT_READ_TIME_COL = f"next_{DEFAULT_READ_TIME_COL}" 21 | 22 | # ARTICLES 23 | DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL = "last_modified_time" 24 | DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL = "published_time" 25 | DEFAULT_SENTIMENT_LABEL_COL = "sentiment_label" 26 | DEFAULT_SENTIMENT_SCORE_COL = "sentiment_score" 27 | DEFAULT_TOTAL_READ_TIME_COL = "total_read_time" 28 | DEFAULT_TOTAL_PAGEVIEWS_COL = "total_pageviews" 29 | DEFAULT_TOTAL_INVIEWS_COL = "total_inviews" 30 | DEFAULT_ARTICLE_TYPE_COL = "article_type" 31 | DEFAULT_CATEGORY_STR_COL = "category_str" 32 | DEFAULT_SUBCATEGORY_COL = "subcategory" 33 | DEFAULT_ENTITIES_COL = "entity_groups" 34 | DEFAULT_IMAGE_IDS_COL = "image_ids" 35 | DEFAULT_SUBTITLE_COL = "subtitle" 36 | DEFAULT_CATEGORY_COL = "category" 37 | DEFAULT_NER_COL = "ner_clusters" 38 | DEFAULT_PREMIUM_COL = "premium" 39 | DEFAULT_TOPICS_COL = "topics" 40 | DEFAULT_TITLE_COL = "title" 41 | DEFAULT_BODY_COL = "body" 42 | DEFAULT_URL_COL = "url" 43 | 44 | # HISTORY 45 | DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL = f"{DEFAULT_IMPRESSION_TIMESTAMP_COL}_fixed" 46 | DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL = f"{DEFAULT_SCROLL_PERCENTAGE_COL}_fixed" 47 | DEFAULT_HISTORY_ARTICLE_ID_COL = f"{DEFAULT_ARTICLE_ID_COL}_fixed" 48 | DEFAULT_HISTORY_READ_TIME_COL = f"{DEFAULT_READ_TIME_COL}_fixed" 49 | 50 | # CREATE 51 | DEFAULT_KNOWN_USER_COL = "is_known_user" 52 | DEFAULT_LABELS_COL = "labels" 53 | -------------------------------------------------------------------------------- /src/ebrec/utils/_decay.py: -------------------------------------------------------------------------------- 1 | try: 2 | import polars as pl 3 | except ImportError: 4 | print("polars not available") 5 | 6 | 7 | def linear_decay_weights(n: int, ascending: bool = True, **kwargs) -> list[float]: 8 | """ 9 | Generates a list of weights in a linear decaying pattern. 10 | Args: 11 | n (int): The number of weights to generate. Must be a positive integer. 12 | ascending (bool, optional): Flag to determine the order of decay. 13 | If True, the decay is ascending. If False, it's descending. 14 | Defaults to True. 15 | Returns: 16 | List[float]: A list of linearly decaying weights. 
17 | Raises: 18 | ValueError: If 'n' is not a positive integer. 19 | Examples: 20 | >>> linear_decay_weights(5, True) 21 | [0.2, 0.4, 0.6, 0.8, 1.0] 22 | >>> linear_decay_weights(10, False) 23 | [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1] 24 | """ 25 | weights = [(n - i) / n for i in range(n)] 26 | return weights if not ascending else weights[::-1] 27 | 28 | 29 | def exponential_decay_weights( 30 | n: int, lambda_factor: float, ascending: bool = True, **kwargs 31 | ) -> list[float]: 32 | """ 33 | Generates a list of weights in an exponential decay pattern. 34 | Args: 35 | n (int): The number of weights to generate. Must be a non-negative integer. 36 | lambda_factor (float): The factor by which the weights decay exponentially. 37 | ascending (bool, optional): Flag to determine the order of decay. 38 | If True, the decay is ascending. If False, it's descending. 39 | Defaults to True. 40 | Returns: 41 | List[float]: A list of exponentially decaying weights. 42 | Raises: 43 | ValueError: If 'n' is negative. 44 | Examples: 45 | >>> exponential_decay_weights(5, 0.5, True) 46 | [0.0625, 0.125, 0.25, 0.5, 1.0] 47 | >>> exponential_decay_weights(10, 0.5, False) 48 | [1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625, 0.001953125] 49 | """ 50 | weights = [lambda_factor ** (n - i - 1) for i in range(n)] 51 | return weights if ascending else weights[::-1] 52 | 53 | 54 | def add_decay_weights( 55 | df, column: str, decay_func: callable, ascending: bool = True, **kwargs: dict 56 | ): 57 | """ 58 | Wrapper function: Adding decay weights to column using decay function scheme 59 | >>> df = pl.DataFrame( 60 | { 61 | "col1": [ 62 | [[1], [1], [1], [1]], 63 | [[1, 1], [1, 1], [1, 1]], 64 | [[1, 1, 1], [1, 1, 1]], 65 | None, 66 | ], 67 | "col2": [4, 5, 6, 7], 68 | } 69 | ) 70 | >>> add_decay_weights(df, "col1", decay_func=linear_decay_weights, ascending=True) 71 | shape: (4, 3) 72 | ┌──────────────────────────┬───────────────────────────┬──────┐ 73 | │ col1 ┆ col1_weights ┆ col2 │ 74 | │ --- ┆ --- ┆ --- │ 75 | │ list[list[i64]] ┆ list[f64] ┆ i64 │ 76 | ╞══════════════════════════╪═══════════════════════════╪══════╡ 77 | │ [[1], [1], … [1]] ┆ [0.25, 0.5, … 1.0] ┆ 4 │ 78 | │ [[1, 1], [1, 1], [1, 1]] ┆ [0.333333, 0.666667, 1.0] ┆ 5 │ 79 | │ [[1, 1, 1], [1, 1, 1]] ┆ [0.5, 1.0] ┆ 6 │ 80 | │ null ┆ [] ┆ 7 │ 81 | └──────────────────────────┴───────────────────────────┴──────┘ 82 | >>> add_decay_weights(df, "col1", decay_func=exponential_decay_weights, ascending=True, **{"lambda_factor" : 0.5}) 83 | shape: (4, 3) 84 | ┌──────────────────────────┬──────────────────────┬──────┐ 85 | │ col1 ┆ col1_weights ┆ col2 │ 86 | │ --- ┆ --- ┆ --- │ 87 | │ list[list[i64]] ┆ list[f64] ┆ i64 │ 88 | ╞══════════════════════════╪══════════════════════╪══════╡ 89 | │ [[1], [1], … [1]] ┆ [0.125, 0.25, … 1.0] ┆ 4 │ 90 | │ [[1, 1], [1, 1], [1, 1]] ┆ [0.25, 0.5, 1.0] ┆ 5 │ 91 | │ [[1, 1, 1], [1, 1, 1]] ┆ [0.5, 1.0] ┆ 6 │ 92 | │ null ┆ [] ┆ 7 │ 93 | └──────────────────────────┴──────────────────────┴──────┘ 94 | """ 95 | lengths = df[column].list.len().to_list() 96 | weights = [decay_func(n=i, ascending=ascending, **kwargs) for i in lengths] 97 | return df.with_columns(pl.Series(f"{column}_weights", weights)) 98 | 99 | 100 | def decay_weighting_nested_lists( 101 | df, column_history: str, column_history_weights: str, fill_nulls: int = None 102 | ): 103 | """ 104 | >>> df = pl.DataFrame( 105 | { 106 | "col1": [ 107 | [[1], [1], [1], [1]], 108 | [[1, 1], [1, 1], [1, 1]], 109 | [[1, 1, 1], [1, 1, 1]], 110 | [[1], 
None], 111 | None, 112 | ], 113 | "col1_weights": 114 | [[0.25, 0.5, 0.75, 1.0], 115 | [0.33, 0.67, 1.0], 116 | [0.5, 1.0], 117 | [0.5, 1.0], 118 | [] 119 | ], 120 | "col2": [4, 5, 6, 7, 8 ], 121 | } 122 | ) 123 | >>> decay_weighting_nested_lists(df, column_history="col1", column_history_weights="col1_weights")["col1"] 124 | Series: 'col1' [list[list[f64]]] 125 | [ 126 | [[0.25], [0.5], … [1.0]] 127 | [[0.33, 0.33], [0.67, 0.67], [1.0, 1.0]] 128 | [[0.5, 0.5, 0.5], [1.0, 1.0, 1.0]] 129 | [[0.5], [null]] 130 | null 131 | ] 132 | >>> decay_weighting_nested_lists(df.lazy(), "col1", "col1_weights").collect() 133 | """ 134 | GROUP_BY_COLUMN_FIRST = "group_by_1" 135 | GROUP_BY_COLUMN_SECOND = "group_by_2" 136 | COLUMNS = df.columns 137 | 138 | df = df.with_row_count(GROUP_BY_COLUMN_FIRST) 139 | 140 | exploded_weights = df.drop_nulls(column_history).select( 141 | pl.col(column_history_weights).explode() 142 | ) 143 | 144 | if isinstance(exploded_weights, pl.LazyFrame): 145 | exploded_weights = exploded_weights.collect() 146 | 147 | df_ = ( 148 | df.select(pl.col(GROUP_BY_COLUMN_FIRST, column_history)) 149 | .drop_nulls(column_history) 150 | .explode(column_history) 151 | .with_columns(exploded_weights.select(column_history_weights)) 152 | .with_row_count(GROUP_BY_COLUMN_SECOND) 153 | # Not optimal to explode, I want to compute [1,2,2] * 0.5 => (list * float) 154 | .explode(column_history) 155 | .with_columns( 156 | (pl.col(column_history) * pl.col(column_history_weights)).alias( 157 | column_history 158 | ) 159 | ) 160 | .group_by([GROUP_BY_COLUMN_SECOND]) 161 | .agg(pl.col(GROUP_BY_COLUMN_FIRST).first(), column_history) 162 | .group_by(GROUP_BY_COLUMN_FIRST) 163 | .agg(column_history) 164 | .sort(GROUP_BY_COLUMN_FIRST) 165 | ) 166 | 167 | return ( 168 | df.drop(column_history) 169 | .join(df_, on=GROUP_BY_COLUMN_FIRST, how="left") 170 | .select(COLUMNS) 171 | ) 172 | -------------------------------------------------------------------------------- /src/ebrec/utils/_descriptive_analysis.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | from ebrec.utils._constants import ( 4 | DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL, 5 | DEFAULT_IMPRESSION_TIMESTAMP_COL, 6 | ) 7 | 8 | 9 | def min_max_impression_time_history( 10 | df: pl.DataFrame, timestamp_col: str = DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL 11 | ): 12 | """ 13 | Check min/max for user history timestamp column. 14 | """ 15 | return ( 16 | df.select(pl.col(timestamp_col)) 17 | .with_columns( 18 | pl.col(timestamp_col).list.eval(pl.element().min()).explode().alias("min") 19 | ) 20 | .with_columns( 21 | pl.col(timestamp_col).list.eval(pl.element().max()).explode().alias("max") 22 | ) 23 | .select(pl.col("min").min(), pl.col("max").max()) 24 | ) 25 | 26 | 27 | def min_max_impression_time_behaviors( 28 | df: pl.DataFrame, timestamp_col: str = DEFAULT_IMPRESSION_TIMESTAMP_COL 29 | ): 30 | """ 31 | Check min/max for behaviors timestamp column. 
32 | """ 33 | return df.select( 34 | pl.col(timestamp_col).min().alias("min"), 35 | pl.col(timestamp_col).max().alias("max"), 36 | ) 37 | -------------------------------------------------------------------------------- /src/ebrec/utils/_nlp.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | import torch 4 | 5 | from ebrec.utils._python import get_torch_device 6 | 7 | try: 8 | from torch.utils.data import DataLoader, TensorDataset 9 | except ImportError: 10 | print("torch not available") 11 | try: 12 | from transformers import AutoTokenizer, AutoModel 13 | except ImportError: 14 | print("transformers not available") 15 | 16 | 17 | def get_transformers_word_embeddings(model: AutoModel): 18 | return model.embeddings.word_embeddings.weight.data.to("cpu").numpy() 19 | 20 | 21 | def generate_embeddings_with_transformers( 22 | model: AutoModel, 23 | tokenizer: AutoTokenizer, 24 | text_list: list[str], 25 | batch_size: int = 8, 26 | device: str = None, 27 | disable_tqdm: bool = False, 28 | ) -> torch.Tensor: 29 | """ 30 | Generates embeddings for a list of texts using a pre-trained transformer model. 31 | 32 | Args: 33 | model (AutoModel): The pre-trained transformer model used to encode the texts. 34 | tokenizer (AutoTokenizer): The tokenizer corresponding to `model`. 35 | text_list (list of str): A list of texts to generate embeddings for. 36 | batch_size (int): The batch size to use for generating embeddings. Defaults to 8. 37 | device (str): The device to use (e.g., "cpu", "cuda"). If None, defaults to the first available GPU or CPU. 38 | 39 | Returns: 40 | embeddings (torch.Tensor): A tensor containing the embeddings for the input texts. 41 | The shape of the tensor is (num_texts, embedding_dim), where num_texts is the number 42 | of input texts and embedding_dim is the dimensionality of the embeddings produced by 43 | the pre-trained model. 44 | 45 | Examples: 46 | >>> model_name = "bert-base-uncased" 47 | >>> text_list = ["hello world", "how are you"] 48 | >>> batch_size = 2 49 | >>> device = "cpu" 50 | >>> model = AutoModel.from_pretrained(model_name) 51 | >>> tokenizer = AutoTokenizer.from_pretrained(model_name) 52 | >>> embeddings_tensor = generate_embeddings_with_transformers(model, tokenizer, text_list, batch_size, device) 53 | >>> print(embeddings_tensor) 54 | tensor([[-0.0243, 0.1144, 0.0830, ..., -0.2666, 0.1662, 0.1519], 55 | [ 0.0827, 0.0877, -0.0688, ..., -0.4381, 0.0462, -0.1446]]) 56 | >>> print(embeddings_tensor.shape) 57 | torch.Size([2, 768]) 58 | """ 59 | device = get_torch_device(use_gpu=True) if device is None else device 60 | model = model.to(device) 61 | 62 | tokenized_text = tokenizer( 63 | text_list, padding=True, truncation=True, return_tensors="pt" 64 | ) 65 | feature_names = list(tokenized_text) 66 | 67 | dataset = TensorDataset( 68 | tokenized_text["input_ids"], tokenized_text["attention_mask"] 69 | ) 70 | dataloader = DataLoader(dataset, batch_size=batch_size) 71 | embeddings = [] 72 | with torch.no_grad(): 73 | for batch in tqdm(dataloader, desc="Encoding", disable=disable_tqdm): 74 | inputs = {feat: t.to(device) for feat, t in zip(feature_names, batch)} 75 | outputs = model( 76 | **inputs, 77 | output_hidden_states=True, 78 | ) 79 | embeddings.append(outputs.last_hidden_state[:, 0, :].squeeze(dim=1)) 80 | return torch.vstack(embeddings) 81 | 82 | 83 | if __name__ == "__main__": 84 | # 85 | model_name = "xlm-roberta-base" 86 | batch_size = 8 87 | text_list = [ 88 | "hej med dig. 
Jeg er en tekst.", 89 | "Jeg er en anden tekst, skal du spille smart?", 90 | "oh nej..", 91 | ] 92 | model = AutoModel.from_pretrained(model_name) 93 | tokenizer = AutoTokenizer.from_pretrained(model_name) 94 | t = generate_embeddings_with_transformers( 95 | model, tokenizer, text_list, batch_size, "cpu" 96 | ) 97 | -------------------------------------------------------------------------------- /src/ebrec/utils/_torch.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | try: 4 | import torch 5 | except ImportError: 6 | print("torch not available") 7 | 8 | 9 | def save_checkpoint(model, path="model_state_dict.pt"): 10 | path = Path(path) 11 | path.parent.mkdir(parents=True, exist_ok=True) 12 | print(f"Saving model weights: {path}") 13 | torch.save(model.state_dict(), path.as_posix()) 14 | -------------------------------------------------------------------------------- /test/bombing/bomb_dataloader.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import polars as pl 3 | import numpy as np 4 | 5 | from ebrec.models.newsrec.dataloader import ( 6 | LSTURDataLoader, 7 | NRMSDataLoader, 8 | ) 9 | from ebrec.utils._behaviors import create_user_id_to_int_mapping 10 | from ebrec.utils._articles import create_article_id_to_value_mapping 11 | 12 | from ebrec.utils._python import time_it 13 | from tqdm import tqdm 14 | 15 | from ebrec.utils._behaviors import create_binary_labels_column 16 | from ebrec.utils._constants import ( 17 | DEFAULT_HISTORY_ARTICLE_ID_COL, 18 | DEFAULT_CLICKED_ARTICLES_COL, 19 | DEFAULT_INVIEW_ARTICLES_COL, 20 | DEFAULT_ARTICLE_ID_COL, 21 | DEFAULT_CATEGORY_COL, 22 | DEFAULT_USER_COL, 23 | ) 24 | 25 | from ebrec.models.fastformer.dataloader import FastformerDataset 26 | from torch.utils.data import DataLoader 27 | 28 | N_ITERATIONS = 300 29 | BATCH_SIZE = 100 30 | TOKEN_COL = "tokens" 31 | N_SAMPLES = "n" 32 | 33 | # LOAD DATA: 34 | PATH_DATA = Path("test/data") 35 | df_articles = ( 36 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "articles.parquet")) 37 | .select(pl.col(DEFAULT_ARTICLE_ID_COL, DEFAULT_CATEGORY_COL)) 38 | .with_columns(pl.Series(TOKEN_COL, np.random.randint(0, 20, (1, 10)))) 39 | .collect() 40 | ) 41 | df_history = ( 42 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "history.parquet")) 43 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL) 44 | .with_columns(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.tail(3)) 45 | ) 46 | df_behaviors = ( 47 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "behaviors.parquet")) 48 | .select(DEFAULT_USER_COL, DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_CLICKED_ARTICLES_COL) 49 | .with_columns(pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.len().alias(N_SAMPLES)) 50 | .join(df_history, on=DEFAULT_USER_COL, how="left") 51 | .collect() 52 | .pipe(create_binary_labels_column) 53 | ) 54 | # => MAPPINGS: 55 | article_mapping = create_article_id_to_value_mapping( 56 | df=df_articles, value_col=TOKEN_COL 57 | ) 58 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors) 59 | # => NPRATIO IMPRESSION - SAME LENGTHS: 60 | df_behaviors_train = df_behaviors.filter(pl.col(N_SAMPLES) == pl.col(N_SAMPLES).min()) 61 | # => FOR TEST-DATALOADER 62 | label_lengths = df_behaviors[DEFAULT_INVIEW_ARTICLES_COL].list.len().to_list() 63 | 64 | 65 | def iter_dataloader(dataloader, name: str, iterations: int): 66 | for _ in tqdm(range(iterations), desc=name): 67 | for _ in dataloader: 68 | pass 69 | 70 | 71 | # === 72 | 
@time_it(True) 73 | def bomb_NRMSDataLoader(): 74 | dataloader = NRMSDataLoader( 75 | behaviors=df_behaviors_train, 76 | article_dict=article_mapping, 77 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 78 | unknown_representation="zeros", 79 | eval_mode=False, 80 | batch_size=BATCH_SIZE, 81 | ) 82 | iter_dataloader(dataloader, "NRMS-train", iterations=N_ITERATIONS) 83 | 84 | dataloader = NRMSDataLoader( 85 | behaviors=df_behaviors, 86 | article_dict=article_mapping, 87 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 88 | unknown_representation="zeros", 89 | eval_mode=True, 90 | batch_size=BATCH_SIZE, 91 | ) 92 | iter_dataloader(dataloader, "NRMS-test", iterations=N_ITERATIONS) 93 | 94 | 95 | @time_it(True) 96 | def bomb_LSTURDataLoader(): 97 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors_train) 98 | 99 | dataloader = LSTURDataLoader( 100 | behaviors=df_behaviors_train, 101 | article_dict=article_mapping, 102 | user_id_mapping=user_mapping, 103 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 104 | unknown_representation="zeros", 105 | batch_size=BATCH_SIZE, 106 | ) 107 | iter_dataloader(dataloader, "LSTUR-train", iterations=N_ITERATIONS) 108 | 109 | dataloader = LSTURDataLoader( 110 | behaviors=df_behaviors, 111 | article_dict=article_mapping, 112 | user_id_mapping=user_mapping, 113 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 114 | unknown_representation="zeros", 115 | batch_size=BATCH_SIZE, 116 | eval_mode=True, 117 | ) 118 | iter_dataloader(dataloader, "LSTUR-test", iterations=N_ITERATIONS) 119 | 120 | 121 | # === 122 | @time_it(True) 123 | def bomb_FastformerDataLoader(): 124 | dataloader = DataLoader( 125 | FastformerDataset( 126 | behaviors=df_behaviors_train, 127 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 128 | article_dict=article_mapping, 129 | batch_size=BATCH_SIZE, 130 | shuffle=True, 131 | ) 132 | ) 133 | iter_dataloader(dataloader, "Fastformer-train", iterations=N_ITERATIONS) 134 | 135 | dataloader = DataLoader( 136 | FastformerDataset( 137 | behaviors=df_behaviors, 138 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 139 | article_dict=article_mapping, 140 | batch_size=BATCH_SIZE, 141 | shuffle=False, 142 | ) 143 | ) 144 | iter_dataloader(dataloader, "Fastformer-test", iterations=N_ITERATIONS) 145 | 146 | 147 | if __name__ == "__main__": 148 | bomb_NRMSDataLoader() 149 | bomb_LSTURDataLoader() 150 | bomb_FastformerDataLoader() 151 | -------------------------------------------------------------------------------- /test/data/ebnerd/articles.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/articles.parquet -------------------------------------------------------------------------------- /test/data/ebnerd/behaviors.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/behaviors.parquet -------------------------------------------------------------------------------- /test/data/ebnerd/document_vector.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/document_vector.parquet -------------------------------------------------------------------------------- /test/data/ebnerd/history.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/history.parquet -------------------------------------------------------------------------------- /test/dataloader/test_fastformer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import polars as pl 3 | import numpy as np 4 | import torch 5 | from ebrec.utils._behaviors import create_user_id_to_int_mapping 6 | from ebrec.utils._articles import create_article_id_to_value_mapping 7 | 8 | from ebrec.utils._python import time_it 9 | from ebrec.utils._behaviors import create_binary_labels_column 10 | from ebrec.utils._constants import ( 11 | DEFAULT_HISTORY_ARTICLE_ID_COL, 12 | DEFAULT_CLICKED_ARTICLES_COL, 13 | DEFAULT_INVIEW_ARTICLES_COL, 14 | DEFAULT_ARTICLE_ID_COL, 15 | DEFAULT_CATEGORY_COL, 16 | DEFAULT_USER_COL, 17 | ) 18 | 19 | from ebrec.models.fastformer.dataloader import FastformerDataset 20 | from torch.utils.data import DataLoader 21 | 22 | TOKEN_COL = "tokens" 23 | N_SAMPLES = "n" 24 | BATCH_SIZE = 100 25 | 26 | # LOAD DATA: 27 | PATH_DATA = Path("test/data") 28 | df_articles = ( 29 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "articles.parquet")) 30 | .select(pl.col(DEFAULT_ARTICLE_ID_COL, DEFAULT_CATEGORY_COL)) 31 | .with_columns(pl.Series(TOKEN_COL, np.random.randint(0, 20, (1, 10)))) 32 | .collect() 33 | ) 34 | df_history = ( 35 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "history.parquet")) 36 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL) 37 | .with_columns(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.tail(3)) 38 | ) 39 | df_behaviors = ( 40 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "behaviors.parquet")) 41 | .select(DEFAULT_USER_COL, DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_CLICKED_ARTICLES_COL) 42 | .with_columns(pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.len().alias(N_SAMPLES)) 43 | .join(df_history, on=DEFAULT_USER_COL, how="left") 44 | .collect() 45 | .pipe(create_binary_labels_column) 46 | ) 47 | # => MAPPINGS: 48 | article_mapping = create_article_id_to_value_mapping( 49 | df=df_articles, value_col=TOKEN_COL 50 | ) 51 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors) 52 | # => NPRATIO IMPRESSION - SAME LENGTHS: 53 | df_behaviors_train = df_behaviors.filter(pl.col(N_SAMPLES) == pl.col(N_SAMPLES).min()) 54 | # => FOR TEST-DATALOADER 55 | label_lengths = df_behaviors[DEFAULT_INVIEW_ARTICLES_COL].list.len().to_list() 56 | 57 | 58 | @time_it(True) 59 | def test_FastformerDataloader(): 60 | train_dataloader = DataLoader( 61 | FastformerDataset( 62 | behaviors=df_behaviors_train, 63 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 64 | article_dict=article_mapping, 65 | batch_size=BATCH_SIZE, 66 | shuffle=True, 67 | ) 68 | ) 69 | 70 | batch = train_dataloader.__iter__().__next__() 71 | 72 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100)) 73 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)" 74 | assert ( 75 | len(batch[0]) == 2 76 | ), "Fastformer has two outputs (history_input, candidate_input)" 77 | 78 | for type_in_batch in batch[0]: 79 | assert ( 80 | type_in_batch.dtype == torch.int 81 | ), "Expected output to be integer; used for lookup value" 82 | 83 | assert batch[1].dtype == torch.float, "Expected output to be float; this is label" 84 | 85 | test_dataloader = DataLoader( 86 | FastformerDataset( 87 |
behaviors=df_behaviors, 88 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 89 | article_dict=article_mapping, 90 | batch_size=BATCH_SIZE, 91 | shuffle=False, 92 | ) 93 | ) 94 | 95 | batch = test_dataloader.__iter__().__next__() 96 | assert len(batch[1].squeeze(0)) == sum( 97 | label_lengths[:BATCH_SIZE] 98 | ), "Should have unfolded all the test samples" 99 | -------------------------------------------------------------------------------- /test/dataloader/test_newsrec.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import polars as pl 3 | import numpy as np 4 | import torch 5 | from ebrec.utils._behaviors import create_user_id_to_int_mapping 6 | from ebrec.utils._articles import create_article_id_to_value_mapping 7 | from ebrec.utils._python import create_lookup_dict 8 | 9 | from ebrec.models.newsrec.dataloader import ( 10 | LSTURDataLoader, 11 | NAMLDataLoader, 12 | NRMSDataLoader, 13 | ) 14 | from ebrec.utils._python import time_it 15 | from ebrec.utils._behaviors import create_binary_labels_column 16 | from ebrec.utils._constants import ( 17 | DEFAULT_HISTORY_ARTICLE_ID_COL, 18 | DEFAULT_CLICKED_ARTICLES_COL, 19 | DEFAULT_INVIEW_ARTICLES_COL, 20 | DEFAULT_ARTICLE_ID_COL, 21 | DEFAULT_CATEGORY_COL, 22 | DEFAULT_USER_COL, 23 | ) 24 | 25 | from ebrec.models.fastformer.dataloader import FastformerDataset 26 | from torch.utils.data import DataLoader 27 | 28 | TOKEN_COL = "tokens" 29 | N_SAMPLES = "n" 30 | BATCH_SIZE = 100 31 | 32 | # LOAD DATA: 33 | PATH_DATA = Path("test/data") 34 | df_articles = ( 35 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "articles.parquet")) 36 | .select(pl.col(DEFAULT_ARTICLE_ID_COL, DEFAULT_CATEGORY_COL)) 37 | .with_columns(pl.Series(TOKEN_COL, np.random.randint(0, 20, (1, 10)))) 38 | .collect() 39 | ) 40 | df_history = ( 41 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "history.parquet")) 42 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL) 43 | .with_columns(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.tail(3)) 44 | ) 45 | df_behaviors = ( 46 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "behaviors.parquet")) 47 | .select(DEFAULT_USER_COL, DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_CLICKED_ARTICLES_COL) 48 | .with_columns(pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.len().alias(N_SAMPLES)) 49 | .join(df_history, on=DEFAULT_USER_COL, how="left") 50 | .collect() 51 | .pipe(create_binary_labels_column) 52 | ) 53 | # => MAPPINGS: 54 | article_mapping = create_article_id_to_value_mapping( 55 | df=df_articles, value_col=TOKEN_COL 56 | ) 57 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors) 58 | # => NPRATIO IMPRESSION - SAME LENGTHS: 59 | df_behaviors_train = df_behaviors.filter(pl.col(N_SAMPLES) == pl.col(N_SAMPLES).min()) 60 | # => FOR TEST-DATALOADER 61 | label_lengths = df_behaviors[DEFAULT_INVIEW_ARTICLES_COL].list.len().to_list() 62 | 63 | 64 | # === 65 | @time_it(True) 66 | def test_NRMSDataLoader(): 67 | train_dataloader = NRMSDataLoader( 68 | behaviors=df_behaviors_train, 69 | article_dict=article_mapping, 70 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 71 | unknown_representation="zeros", 72 | eval_mode=False, 73 | batch_size=BATCH_SIZE, 74 | ) 75 | 76 | batch = train_dataloader.__iter__().__next__() 77 | 78 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100)) 79 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)" 80 | assert ( 81 | len(batch[0]) == 2 82 | ), "NRMS has two outputs (his_input_title, 
pred_input_title_one)" 83 | 84 | for type_in_batch in batch[0][0]: 85 | assert isinstance( 86 | type_in_batch.ravel()[0], np.integer 87 | ), "Expected output to be integer; used for lookup value" 88 | 89 | assert isinstance( 90 | batch[1].ravel()[0], np.integer 91 | ), "Expected output to be integer; this is label" 92 | 93 | test_dataloader = NRMSDataLoader( 94 | behaviors=df_behaviors, 95 | article_dict=article_mapping, 96 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 97 | unknown_representation="zeros", 98 | eval_mode=True, 99 | batch_size=BATCH_SIZE, 100 | ) 101 | 102 | batch = test_dataloader.__iter__().__next__() 103 | assert len(batch[1]) == sum( 104 | label_lengths[:BATCH_SIZE] 105 | ), "Should have unfolded all the test samples" 106 | 107 | 108 | @time_it(True) 109 | def test_LSTURDataLoader(): 110 | train_dataloader = LSTURDataLoader( 111 | behaviors=df_behaviors_train, 112 | article_dict=article_mapping, 113 | user_id_mapping=user_mapping, 114 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 115 | unknown_representation="zeros", 116 | batch_size=BATCH_SIZE, 117 | ) 118 | 119 | batch = train_dataloader.__iter__().__next__() 120 | 121 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100)) 122 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)" 123 | assert ( 124 | len(batch[0]) == 3 125 | ), "LSTUR has three outputs (user_indexes, his_input_title, pred_input_title_one)" 126 | 127 | for type_in_batch in batch[0][0]: 128 | assert isinstance( 129 | type_in_batch.ravel()[0], np.integer 130 | ), "Expected output to be integer; used for lookup value" 131 | 132 | assert isinstance( 133 | batch[1].ravel()[0], np.integer 134 | ), "Expected output to be integer; this is label" 135 | 136 | test_dataloader = LSTURDataLoader( 137 | behaviors=df_behaviors, 138 | article_dict=article_mapping, 139 | user_id_mapping=user_mapping, 140 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 141 | unknown_representation="zeros", 142 | batch_size=BATCH_SIZE, 143 | eval_mode=True, 144 | ) 145 | 146 | batch = test_dataloader.__iter__().__next__() 147 | assert len(batch[1]) == sum( 148 | label_lengths[:BATCH_SIZE] 149 | ), "Should have unfolded all the test samples" 150 | 151 | 152 | @time_it(True) 153 | def test_NAMLDataLoader(): 154 | body_mapping = article_mapping 155 | category_mapping = create_lookup_dict( 156 | df_articles.select(pl.col(DEFAULT_CATEGORY_COL).unique()).with_row_index( 157 | "row_nr" 158 | ), 159 | key=DEFAULT_CATEGORY_COL, 160 | value="row_nr", 161 | ) 162 | subcategory_mapping = category_mapping 163 | 164 | train_dataloader = NAMLDataLoader( 165 | behaviors=df_behaviors_train, 166 | article_dict=article_mapping, 167 | body_mapping=body_mapping, 168 | category_mapping=category_mapping, 169 | unknown_representation="zeros", 170 | subcategory_mapping=subcategory_mapping, 171 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 172 | batch_size=BATCH_SIZE, 173 | ) 174 | 175 | batch = train_dataloader.__iter__().__next__() 176 | 177 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100)) 178 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)" 179 | assert ( 180 | len(batch[0]) == 8 181 | ), "NAML has eight outputs (his_input_title,his_input_body,his_input_vert,his_input_subvert,pred_input_title,pred_input_body,pred_input_vert,pred_input_subvert)" 182 | 183 | for type_in_batch in batch[0][0]: 184 | assert isinstance( 185 | type_in_batch.ravel()[0], np.integer 186 | ), "Expected output to be 
integer; used for lookup value" 187 | 188 | assert isinstance( 189 | batch[1].ravel()[0], np.integer 190 | ), "Expected output to be integer; this is label" 191 | -------------------------------------------------------------------------------- /test/evaluation/test_beyond_accuracy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics.pairwise import cosine_distances 3 | 4 | from ebrec.evaluation.beyond_accuracy import ( 5 | IntralistDiversity, 6 | Distribution, 7 | Serendipity, 8 | Novelty, 9 | Coverage, 10 | ) 11 | 12 | lookup_dict = { 13 | "101": {"doc_vec": np.array([1, 0, 0]), "v": 1, "sv": [1], "pop_sc": 0.50}, 14 | "102": {"doc_vec": np.array([0, 1, 0]), "v": 2, "sv": [1], "pop_sc": 0.25}, 15 | "103": {"doc_vec": np.array([1, 1, 1]), "v": 3, "sv": [1], "pop_sc": 0.75}, 16 | "104": {"doc_vec": np.array([1, 1, 1]), "v": 4, "sv": [1], "pop_sc": 0.50}, 17 | "105": {"doc_vec": np.array([-1, 0, 0]), "v": 5, "sv": [1], "pop_sc": 0.94}, 18 | "106": {"doc_vec": np.array([-1, 0, 0]), "v": 6, "sv": [1, 2], "pop_sc": 0.95}, 19 | "107": {"doc_vec": np.array([-1, 0, 0]), "v": 7, "sv": [1, 2], "pop_sc": 0.96}, 20 | "108": {"doc_vec": np.array([0, 0, 1]), "v": 8, "sv": [1, 2], "pop_sc": 0.50}, 21 | "400": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4], "pop_sc": 0.20}, 22 | "401": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4, 5], "pop_sc": 0.20}, 23 | } 24 | 25 | # 404 is not expected; however, the setup supports it: 26 | R = np.array( 27 | [ 28 | ["101", "102", "400"], 29 | ["101", "103", "400"], 30 | ["101", "102", "103"], 31 | ["101", "104", "400"], 32 | ["101", "106", "404"], 33 | ["404", "404", "404"], 34 | ] 35 | ) 36 | 37 | C = ["1", "2", "101", "102", "103", "104", "105", "106", "107", "108", "400", "401"] 38 | 39 | click_histories = [ 40 | np.array([["101", "102"]]), 41 | np.array([["105", "106", "400"]]), 42 | np.array([["102", "103", "104"]]), 43 | np.array([["101", "400"]]), 44 | np.array([["400"]]), 45 | np.array([["400"]]), 46 | ] 47 | pairwise_distance_function = cosine_distances 48 | 49 | # TODO: add the test 50 | --------------------------------------------------------------------------------
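The test module above ends with a "# TODO: add the test" placeholder. Below is a minimal, hedged sketch of reference values such a test could assert. It deliberately uses only numpy and scikit-learn (both already imported in the module) and hand-computed expectations from the fixtures R, C, and lookup_dict, because the exact call signatures of IntralistDiversity, Coverage, and the other metric classes are not shown in this dump; wiring these reference values up to those classes would still need to be done once their interfaces are confirmed. The definitions assumed here (intralist diversity as the mean pairwise cosine distance within one recommendation list, coverage as the fraction of candidate items that appear anywhere in the recommendations) are common conventions, not statements about the repository's implementation, and the function names are illustrative only.

import numpy as np
from sklearn.metrics.pairwise import cosine_distances


def test_intralist_diversity_reference():
    # Reference value for the first recommendation list in R: ["101", "102", "400"].
    # Their document vectors in lookup_dict are mutually orthogonal, so every
    # pairwise cosine distance is 1.0 and the mean over the unique pairs is 1.0.
    vectors = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    distances = cosine_distances(vectors)
    unique_pairs = distances[np.triu_indices(distances.shape[0], k=1)]
    assert np.isclose(unique_pairs.mean(), 1.0)


def test_coverage_reference():
    # Reference value for candidate coverage: the fraction of C that occurs in R.
    # Only "101", "102", "103", "104", "106", "400" from C appear in R -> 6 / 12.
    R_local = np.array(
        [
            ["101", "102", "400"],
            ["101", "103", "400"],
            ["101", "102", "103"],
            ["101", "104", "400"],
            ["101", "106", "404"],
            ["404", "404", "404"],
        ]
    )
    C_local = ["1", "2", "101", "102", "103", "104", "105", "106", "107", "108", "400", "401"]
    covered = np.intersect1d(R_local, np.array(C_local))
    assert np.isclose(len(covered) / len(C_local), 6 / 12)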