├── config.json ├── utils ├── metrics.py ├── utils.py ├── unsupervised_utils.py └── evaluators.py ├── README.md ├── 3. Supervised Training.py ├── 1. Unsupervised Training.py ├── 0. Generating Splits.ipynb └── 4. Inference.ipynb /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "unsupervised_model": { 3 | "base_name": "AIDA-UPM/mstsb-paraphrase-multilingual-mpnet-base-v2", 4 | "save_name": "trained_models/unsupervised/paraphrase-multilingual-mpnet-base-v2", 5 | "seq_len": 128, 6 | "epochs": 50, 7 | "batch_size": 128, 8 | "warmup_ratio": 0.03, 9 | "top_n": 100, 10 | "seed": 42 11 | }, 12 | "supervised_model": { 13 | "base_name": "trained_models/unsupervised/paraphrase-multilingual-mpnet-base-v2", 14 | "save_name": "trained_models/supervised/paraphrase-multilingual-mpnet-base-v2", 15 | "threshold": 0.0125, 16 | "seq_len": 128, 17 | "epochs": 40, 18 | "batch_size": 256, 19 | "warmup_ratio": 0.05 20 | } 21 | } -------------------------------------------------------------------------------- /utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def get_pos_score(y_true, y_pred): 5 | y_true = y_true.apply(lambda x: set(x.split())) 6 | y_pred = y_pred.apply(lambda x: set(x.split())) 7 | int_true = np.array([len(x[0] & x[1]) / len(x[0]) for x in zip(y_true, y_pred)]) 8 | return round(np.mean(int_true), 5) 9 | 10 | def get_f2_score(y_true, y_pred): 11 | y_true = y_true.apply(lambda x: set(x.split())) 12 | y_pred = y_pred.apply(lambda x: set(x.split())) 13 | tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)]) 14 | fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)]) 15 | fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)]) 16 | f2 = tp / (tp + 0.2 * fp + 0.8 * fn) 17 | return round(f2.mean(), 4) -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | from tqdm import tqdm 4 | import random 5 | import os 6 | import numpy as np 7 | import torch 8 | 9 | # ========================================================================================= 10 | # Seed everything for deterministic results 11 | # ========================================================================================= 12 | def seed_everything(seed): 13 | random.seed(seed) 14 | os.environ['PYTHONHASHSEED'] = str(seed) 15 | np.random.seed(seed) 16 | torch.manual_seed(seed) 17 | torch.cuda.manual_seed(seed) 18 | torch.backends.cudnn.deterministic = True 19 | 20 | def generate_topic_tree(input_topic_df): 21 | df = pd.DataFrame() 22 | 23 | for channel in tqdm(input_topic_df['channel'].unique()): 24 | channel_df = input_topic_df[(input_topic_df['channel'] == channel)].reset_index(drop=True) 25 | for level in sorted(channel_df.level.unique()): 26 | # For level 0, it first creates a topic tree column which is the title of that topic. 
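            # For every deeper level, the parent's topic_tree is joined to the child's title with
            # " > ", so each topic ends up with a breadcrumb string from the channel root down to
            # itself (e.g. "maths g3 to g10 > maths > g6 > ... > videos").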
27 | if level == 0: 28 | topic_tree = channel_df[channel_df['level'] == level]['title'].astype(str) 29 | topic_tree_df = pd.DataFrame([channel_df[channel_df['level'] == level][['id']], topic_tree.values]).T 30 | topic_tree_df.columns = ['child_id', 'topic_tree'] 31 | channel_df = channel_df.merge(topic_tree_df, left_on='id', right_on='child_id', how='left').drop( 32 | ['child_id'], axis=1) 33 | 34 | # Once the topic tree column has been created, the parent node and child node is merged on parent_id = child_id 35 | topic_df_parent = channel_df[channel_df['level'] == level][['id', 'title', 'parent', 'topic_tree']] 36 | topic_df_parent.columns = 'parent_' + topic_df_parent.columns 37 | 38 | topic_df_child = channel_df[channel_df['level'] == level + 1][['id', 'title', 'parent', 'topic_tree']] 39 | topic_df_child.columns = 'child_' + topic_df_child.columns 40 | 41 | topic_df_merged = topic_df_parent.merge(topic_df_child, left_on='parent_id', right_on='child_parent')[ 42 | ['child_id', 'parent_id', 'parent_title', 'child_title', 'parent_topic_tree']] 43 | 44 | # Topic tree is parent topic tree + title of the current child on that level 45 | topic_tree = topic_df_merged['parent_topic_tree'].astype(str) + ' > ' + topic_df_merged[ 46 | 'child_title'].astype(str) 47 | 48 | topic_tree_df = pd.DataFrame([topic_df_merged['child_id'].values, topic_tree.values]).T 49 | topic_tree_df.columns = ['child_id', 'topic_tree'] 50 | 51 | channel_df = channel_df.merge(topic_tree_df, left_on='id', right_on='child_id', how='left').drop( 52 | ['child_id'], axis=1) 53 | if 'topic_tree_y' in list(channel_df.columns): 54 | channel_df['topic_tree'] = channel_df['topic_tree_x'].combine_first(channel_df['topic_tree_y']) 55 | channel_df = channel_df.drop(['topic_tree_x', 'topic_tree_y'], axis=1) 56 | 57 | df = pd.concat([df, channel_df], ignore_index=True) 58 | 59 | df = df.merge(df.groupby("channel")["level"].max().rename("max_level_of_channel").reset_index(), 60 | how="left", 61 | on="channel") 62 | 63 | df["reverse_level"] = df["max_level_of_channel"] - df["level"] 64 | 65 | return df[["id", "topic_tree", "reverse_level"]] 66 | 67 | def read_config(): 68 | f = open('config.json') 69 | config = json.load(f) 70 | # config["supervised_model"]["betas"] = eval(config["supervised_model"]["betas"]) 71 | return config -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Solution for "Learning Equality - Curriculum Recommendations" @Kaggle 2 | 3 | ![architecture](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/InformationRetrieval.png) 4 | 5 | I used [sentence-transformers](https://github.com/UKPLab/sentence-transformers) library and the models from [HuggingFace](https://huggingface.co/). I tried to implement the shared architecture [here](https://www.sbert.net/examples/applications/retrieve_rerank/README.html). 
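
At inference time the two stages chain together roughly like this (a minimal sketch using plain `sentence-transformers` calls rather than the repo's exact code; the model paths are the `save_name` entries from `config.json`, the input strings are placeholders for the processed texts described below, and the repo uses a RAPIDS kNN instead of `util.semantic_search` for retrieval):

```python
from sentence_transformers import SentenceTransformer, CrossEncoder, util

# Stage 1: bi-encoder retrieval over the processed topic / content strings
bi_encoder = SentenceTransformer("trained_models/unsupervised/paraphrase-multilingual-mpnet-base-v2")
cross_encoder = CrossEncoder("trained_models/supervised/paraphrase-multilingual-mpnet-base-v2")

topic_texts = ["[<[language_en]>] [<[topic_title]>] fractions ..."]            # placeholder inputs
content_texts = ["[<[language_en]>] [<[kind_video]>] [<[cntnt_title]>] ..."]   # placeholder inputs

topic_emb = bi_encoder.encode(topic_texts, convert_to_tensor=True)
content_emb = bi_encoder.encode(content_texts, convert_to_tensor=True)

# Top-100 candidate contents per topic by cosine similarity
hits = util.semantic_search(topic_emb, content_emb, top_k=100)

# Stage 2: the cross-encoder re-scores each (topic, candidate) pair; keep pairs above a threshold
pairs = [[topic_texts[0], content_texts[hit["corpus_id"]]] for hit in hits[0]]
scores = cross_encoder.predict(pairs)
```

The bi-encoder is trained with `MultipleNegativesRankingLoss` on the known (topic, content) pairs, and the cross-encoder is then fine-tuned with binary labels on the retrieved top-100 candidates.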
6 | 
7 | The pipeline consists of: 
8 | - [Splitting the Data as Train/Val](https://github.com/nlztrk/Learning-Equality-Curriculum-Recommendations/blob/main/0.%20Generating%20Splits.ipynb) 
9 | - [Text Processing](https://github.com/nlztrk/Learning-Equality-Curriculum-Recommendations/blob/main/utils/unsupervised_utils.py#L154) 
10 | - [Training Sentence-Transformer (Stage 1)](https://github.com/nlztrk/Learning-Equality-Curriculum-Recommendations/blob/main/1.%20Unsupervised%20Training.py) 
11 | - [Retrieve with kNN using Stage 1 Embeddings](https://github.com/nlztrk/Learning-Equality-Curriculum-Recommendations/blob/main/2.%20Unsupervised%20Sampling.ipynb) 
12 | - [Training Cross-Encoder (Stage 2)](https://github.com/nlztrk/Learning-Equality-Curriculum-Recommendations/blob/main/3.%20Supervised%20Training.py) 
13 | - [Inference](https://github.com/nlztrk/Learning-Equality-Curriculum-Recommendations/blob/main/4.%20Inference.ipynb) 
14 | 
15 | ### Splitting the Data as Train/Val 
16 | I saw many different approaches on the forum. I also wanted to exploit the imbalanced language distribution in my split. I assigned all topics in the **source** category to **train**. For the remaining topics, I used: 
17 | 
18 | - **CV Scheme:** Grouped Stratified K-Fold 
19 | - **Folds:** 5 (Used only the first) 
20 | - **Group:** Topic ID 
21 | - **Stratifier Label:** Language 
22 | 
23 | ### Text Processing 
24 | - Created a topic tree (breadcrumb of ancestor titles) for every topic. 
25 | - Created special tokens for each value that **language** and **content kind** can take. 
26 | - Created identifier separators for **topic title**, **topic tree**, **topic description**, **content title**, **content description** and **content text**. 
27 | 
28 | My final model inputs looked like this: 
29 | - **Topic:** `[<[language_en]>] [<[topic_title]>] videos [<[topic_tree]>] maths g3 to g10 > maths > g6 > 17. geometrical constructions > perpendicular and perpendicular bisector > videos [<[topic_desc]>] nan` 
30 | - **Content:** `[<[language_en]>] [<[kind_exercise]>] [<[cntnt_title]>] level 3: identify elements of simple machine(axle,wheel,pulley and inclined plane etc [<[cntnt_desc]>] nan [<[cntnt_text]>] nan` 
31 | 
32 | ### Training Sentence-Transformer (Stage 1) 
33 | - **Base Model:** [AIDA-UPM/mstsb-paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/AIDA-UPM/mstsb-paraphrase-multilingual-mpnet-base-v2) 
34 | - **Sequence Length:** 128 
35 | - **Epochs:** 50 
36 | - **Batch Size:** 128 
37 | - **Warm-Up Ratio:** 0.03 
38 | 
39 | ### Retrieve with kNN using Stage 1 Embeddings 
40 | I used **kNN** from [RAPIDS](https://rapids.ai/) to retrieve the **100** closest content embeddings for each topic embedding using **cosine similarity**. 
41 | 
42 | ### Training Cross-Encoder (Stage 2) 
43 | - **Base Model:** Trained model from Stage 1 
44 | - **Output:** Sigmoid 
45 | - **Sequence Length:** 128 
46 | - **Epochs:** 15 
47 | - **Batch Size:** 256 
48 | - **Warm-Up Ratio:** 0.05 
49 | 
50 | ### Inference 
51 | - Ran all the steps above sequentially in a single script. 
52 | - Tuned the classification threshold on the hold-out validation set to maximize the F2-Score. 
53 | - Imputed topics left without any prediction using their highest-scoring content IDs. 
54 | 
55 | ## What Didn't Work or Didn't Improve the Score 
56 | 
57 | - Language-specific kNN 
58 | - Smaller models 
59 | - Lower sequence length 
60 | - Lower batch size 
61 | - Union submission blending 
62 | 
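The threshold tuning mentioned under **Inference** comes down to the F2 metric from `utils/metrics.py` plus a brute-force sweep over candidate thresholds, roughly as sketched below. The `get_f2_score` body mirrors this repo; `pick_threshold` and its `gt` ground-truth frame are illustrative names only, the actual sweep lives at the end of `3. Supervised Training.py`.

```python
import numpy as np
import pandas as pd

def get_f2_score(y_true: pd.Series, y_pred: pd.Series) -> float:
    # Both series hold space-separated content ids; F2 weights recall four times more than precision.
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    tp = np.array([len(t & p) for t, p in zip(y_true, y_pred)])
    fp = np.array([len(p - t) for t, p in zip(y_true, y_pred)])
    fn = np.array([len(t - p) for t, p in zip(y_true, y_pred)])
    return round((tp / (tp + 0.2 * fp + 0.8 * fn)).mean(), 4)

def pick_threshold(scored_pairs: pd.DataFrame, gt: pd.DataFrame) -> float:
    # scored_pairs: columns topics_ids, content_ids, pred_score (cross-encoder scores on hold-out pairs)
    # gt: correlations.csv restricted to the hold-out topics (columns topic_id, content_ids)
    best_thr, best_f2 = 0.0, -1.0
    for thr in np.arange(0.0, 0.3, 0.0025):
        kept = scored_pairs[scored_pairs.pred_score >= thr]
        preds = kept.groupby("topics_ids")["content_ids"].apply(" ".join).rename("pred_content_ids")
        merged = gt.merge(preds, left_on="topic_id", right_index=True, how="left").fillna("None")
        f2 = get_f2_score(merged["content_ids"], merged["pred_content_ids"])
        if f2 > best_f2:
            best_thr, best_f2 = thr, f2
    return best_thr
```

The commented results at the bottom of `3. Supervised Training.py` show where this sweep landed for different candidate counts.
--------------------------------------------------------------------------------
 /3. 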
Supervised Training.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | import os 5 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" 6 | 7 | 8 | # %% 9 | # ========================================================================================= 10 | # Libraries 11 | # ========================================================================================= 12 | import gc 13 | import time 14 | import math 15 | import random 16 | import warnings 17 | warnings.filterwarnings("ignore") 18 | import numpy as np 19 | import pandas as pd 20 | from tqdm.auto import tqdm 21 | 22 | 23 | from sentence_transformers import SentenceTransformer, CrossEncoder, util 24 | from sentence_transformers.readers import InputExample 25 | from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator 26 | from torch.utils.data import DataLoader 27 | 28 | # Custom libraries 29 | from utils.unsupervised_utils import read_data 30 | from utils.utils import read_config 31 | from utils.metrics import get_pos_score, get_f2_score 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"]="false" 34 | os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"]="true" 35 | 36 | # %% 37 | config = read_config() 38 | DATA_PATH = "../raw_data/" 39 | GENERATED_DATA_PATH = "./generated_files/" 40 | 41 | 42 | # %% 43 | train_df = pd.read_parquet(GENERATED_DATA_PATH + "unsupervised_train.parquet") 44 | test_df = pd.read_parquet(GENERATED_DATA_PATH + "unsupervised_test.parquet") 45 | 46 | correlation_df = pd.read_csv(DATA_PATH + "correlations.csv") 47 | 48 | # %% 49 | train_samples = [InputExample(texts=[row.model_input1, 50 | row.model_input2], 51 | label=int(row.target)) for row in tqdm(train_df.itertuples())] 52 | 53 | test_samples = [InputExample(texts=[row.model_input1, 54 | row.model_input2], 55 | label=int(row.target)) for row in tqdm(test_df.itertuples())] 56 | 57 | 58 | # %% 59 | model = CrossEncoder(config["supervised_model"]["base_name"], 60 | num_labels=1, 61 | max_length=config["supervised_model"]["seq_len"]) 62 | 63 | num_epochs = config["supervised_model"]["epochs"] 64 | 65 | train_dataloader = DataLoader(train_samples, 66 | shuffle=True, 67 | batch_size=config["supervised_model"]["batch_size"], 68 | num_workers=0, 69 | pin_memory=False) 70 | 71 | evaluator = CEBinaryClassificationEvaluator.from_input_examples(test_samples, 72 | name='K12-local-test', 73 | show_progress_bar=True 74 | ) 75 | 76 | warmup_steps = math.ceil(len(train_dataloader) * config["supervised_model"]["warmup_ratio"]) 77 | 78 | 79 | # %% 80 | model.fit(train_dataloader=train_dataloader, 81 | show_progress_bar=True, 82 | evaluator=evaluator, 83 | epochs=num_epochs, 84 | warmup_steps=warmup_steps, 85 | save_best_model=True, 86 | output_path=config["supervised_model"]["save_name"], 87 | use_amp=True) 88 | 89 | 90 | # %% 91 | model 92 | 93 | # %% 94 | # ### Load Model & Tune Threshold 95 | 96 | model = CrossEncoder(config["supervised_model"]["save_name"], 97 | num_labels=1, 98 | max_length=config["supervised_model"]["seq_len"]) 99 | 100 | preds = model.predict(test_df[["model_input1", "model_input2"]].values, 101 | show_progress_bar=True, 102 | batch_size=96) 103 | 104 | test_df["pred_score"] = preds 105 | 106 | 107 | # %% 108 | for thr in np.arange(0., 0.3, 0.0025): 109 | preds_thr_df = test_df[test_df.pred_score >= thr].sort_values(by="pred_score", 110 | ascending=False)[["topics_ids", 111 | "content_ids"]].\ 112 | 
groupby("topics_ids")["content_ids"].apply(lambda x: " ".join(x)).rename("pred_content_ids").reset_index() 113 | 114 | preds_thr_df = preds_thr_df.merge(correlation_df[correlation_df.topic_id.isin(test_df.topics_ids)], 115 | how="outer", right_on="topic_id", left_on="topics_ids") 116 | preds_thr_df.fillna("None", inplace=True) 117 | f2score_for_threshold = get_f2_score(preds_thr_df['content_ids'], 118 | preds_thr_df['pred_content_ids']) 119 | 120 | print(f"Threshold: {thr} | Score: {f2score_for_threshold}") 121 | 122 | # %% 123 | # Threshold: 0.0175 | Score: 0.6395 @100 124 | # Threshold: 0.0175 | Score: 0.6424 @75 125 | # Threshold: 0.0150 | Score: 0.6461 @50 126 | # Threshold: 0.0050 | Score: 0.6464 @25 127 | -------------------------------------------------------------------------------- /1. Unsupervised Training.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | ## 1. Training Unsupervised SentenceTransformer 5 | 6 | # %% 7 | 8 | import faulthandler 9 | faulthandler.enable() 10 | 11 | import os 12 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" 13 | 14 | 15 | # %% 16 | import pandas as pd 17 | import numpy as np 18 | from tqdm.auto import tqdm 19 | 20 | from sentence_transformers import SentenceTransformer, models, InputExample, losses 21 | from sentence_transformers import datasets 22 | 23 | 24 | from datasets import Dataset 25 | from utils.evaluators import InformationRetrievalEvaluator 26 | 27 | import warnings 28 | warnings.filterwarnings('ignore') 29 | 30 | 31 | # %% 32 | # Custom libraries 33 | from utils.unsupervised_utils import generate_topic_model_input, generate_content_model_input, read_data 34 | from utils.utils import read_config 35 | 36 | # %% 37 | os.environ["TOKENIZERS_PARALLELISM"]="true" 38 | os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"]="false" 39 | 40 | 41 | # %% 42 | config = read_config() 43 | 44 | 45 | # %% 46 | DATA_PATH = "../raw_data/" 47 | 48 | 49 | # %% 50 | topics, content, correlations, _ = read_data(data_path=DATA_PATH, 51 | config_obj=config, 52 | read_mode="train") 53 | 54 | topics.rename(columns=lambda x: "topic_" + x, inplace=True) 55 | content.rename(columns=lambda x: "content_" + x, inplace=True) 56 | 57 | correlations["content_id"] = correlations["content_ids"].str.split(" ") 58 | corr = correlations.explode("content_id").drop(columns=["content_ids"]) 59 | 60 | corr = corr.merge(topics, how="left", on="topic_id") 61 | corr = corr.merge(content, how="left", on="content_id") 62 | 63 | corr["set"] = corr[["topic_model_input", "content_model_input"]].values.tolist() 64 | train_df = pd.DataFrame(corr["set"]) 65 | 66 | dataset = Dataset.from_pandas(train_df) 67 | 68 | train_examples = [] 69 | train_data = dataset["set"] 70 | n_examples = dataset.num_rows 71 | 72 | for i in range(n_examples): 73 | example = train_data[i] 74 | if example[0] == None: 75 | continue 76 | train_examples.append(InputExample(texts=[str(example[0]), str(example[1])])) 77 | 78 | # %% 79 | # Setting-up the Evaluation 80 | 81 | test_topics, test_content, test_correlations, _ = read_data(data_path=DATA_PATH, 82 | config_obj=config, 83 | read_mode="test") 84 | 85 | test_correlations["content_id"] = test_correlations["content_ids"].str.split(" ") 86 | test_correlations = test_correlations[test_correlations.topic_id.isin(test_topics.id)].reset_index(drop=True) 87 | test_correlations["content_id"] = test_correlations["content_id"].apply(set) 88 | test_correlations = test_correlations[["topic_id", 
"content_id"]] 89 | 90 | 91 | # %% 92 | ir_relevant_docs = { 93 | row['topic_id']: row['content_id'] for i, row in tqdm(test_correlations.iterrows()) 94 | } 95 | 96 | 97 | # %% 98 | unq_test_topics = test_correlations.explode("topic_id")[["topic_id"]].reset_index(drop=True).drop_duplicates().reset_index(drop=True) 99 | unq_test_topics = unq_test_topics.merge(test_topics[["id", "model_input"]], how="left", left_on="topic_id", 100 | right_on="id").drop("id", 1) 101 | 102 | ir_queries = { 103 | row['topic_id']: row['model_input'] for i, row in tqdm(unq_test_topics.iterrows()) 104 | } 105 | 106 | 107 | # %% 108 | all_topics, all_content, _, special_tokens = read_data(data_path=DATA_PATH, 109 | config_obj=config, 110 | read_mode="all") 111 | 112 | unq_contents = correlations.explode("content_id")[["content_id"]].reset_index(drop=True).drop_duplicates().reset_index(drop=True) 113 | unq_contents = unq_contents.merge(all_content[["id", "model_input"]], how="left", left_on="content_id", 114 | right_on="id").drop("id", 1) 115 | 116 | ir_corpus = { 117 | row['content_id']: row['model_input'] for i, row in tqdm(unq_contents.iterrows()) 118 | } 119 | 120 | # %% 121 | evaluator = InformationRetrievalEvaluator( 122 | ir_queries, 123 | ir_corpus, 124 | ir_relevant_docs, 125 | show_progress_bar=True, 126 | main_score_function="cos_sim", 127 | precision_recall_at_k=[5, 10, 25, 50, 100], 128 | name='K12-local-test-unsupervised' 129 | ) 130 | 131 | # %% 132 | # Training 133 | 134 | train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, 135 | batch_size=config["unsupervised_model"]["batch_size"]) 136 | 137 | 138 | # %% 139 | TARGET_MODEL = config["unsupervised_model"]["base_name"] 140 | OUT_MODEL = config["unsupervised_model"]["save_name"] 141 | TARGET_MODEL, OUT_MODEL 142 | 143 | 144 | # %% 145 | model = SentenceTransformer(TARGET_MODEL) 146 | model.max_seq_length = config["unsupervised_model"]["seq_len"] 147 | 148 | word_embedding_model = model._first_module() 149 | word_embedding_model.tokenizer.add_tokens(list(special_tokens), 150 | special_tokens=True) 151 | word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer)) 152 | 153 | 154 | # %% 155 | train_loss = losses.MultipleNegativesRankingLoss(model=model) 156 | 157 | #k% of train data 158 | num_epochs = config["unsupervised_model"]["epochs"] 159 | warmup_steps = int(len(train_dataloader) * config["unsupervised_model"]["warmup_ratio"]) 160 | 161 | 162 | # %% 163 | model.fit(train_objectives=[(train_dataloader, train_loss)], 164 | # scheduler="constantlr", 165 | # optimizer_class=Lion, 166 | # optimizer_params={'lr': 2e-5}, 167 | evaluator=evaluator, 168 | # evaluation_steps=400, 169 | 170 | checkpoint_path=f"checkpoints/unsupervised/{OUT_MODEL.split('/')[-1]}", 171 | checkpoint_save_steps=len(train_dataloader), 172 | 173 | epochs=num_epochs, 174 | warmup_steps=warmup_steps, 175 | output_path=OUT_MODEL, 176 | save_best_model=True, 177 | use_amp=True) 178 | 179 | -------------------------------------------------------------------------------- /0. 
Generating Splits.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "e4e5bbaa", 7 | "metadata": { 8 | "execution": { 9 | "iopub.execute_input": "2022-12-16T13:22:40.102324Z", 10 | "iopub.status.busy": "2022-12-16T13:22:40.101661Z", 11 | "iopub.status.idle": "2022-12-16T13:22:41.670741Z", 12 | "shell.execute_reply": "2022-12-16T13:22:41.669120Z" 13 | }, 14 | "papermill": { 15 | "duration": 1.58498, 16 | "end_time": "2022-12-16T13:22:41.674272", 17 | "exception": false, 18 | "start_time": "2022-12-16T13:22:40.089292", 19 | "status": "completed" 20 | }, 21 | "tags": [] 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import json\n", 27 | "import matplotlib\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "from tqdm.notebook import tqdm\n", 31 | "from sklearn.model_selection import StratifiedGroupKFold, GroupKFold" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "5ac9a9a3", 38 | "metadata": { 39 | "execution": { 40 | "iopub.execute_input": "2022-12-16T13:22:41.700052Z", 41 | "iopub.status.busy": "2022-12-16T13:22:41.699543Z", 42 | "iopub.status.idle": "2022-12-16T13:22:41.705438Z", 43 | "shell.execute_reply": "2022-12-16T13:22:41.703955Z" 44 | }, 45 | "papermill": { 46 | "duration": 0.023327, 47 | "end_time": "2022-12-16T13:22:41.708354", 48 | "exception": false, 49 | "start_time": "2022-12-16T13:22:41.685027", 50 | "status": "completed" 51 | }, 52 | "tags": [] 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "DATA_PATH = \"../raw_data/\"" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "c38c14b5", 63 | "metadata": { 64 | "execution": { 65 | "iopub.execute_input": "2022-12-16T13:22:41.731837Z", 66 | "iopub.status.busy": "2022-12-16T13:22:41.731417Z", 67 | "iopub.status.idle": "2022-12-16T13:23:05.855527Z", 68 | "shell.execute_reply": "2022-12-16T13:23:05.854203Z" 69 | }, 70 | "papermill": { 71 | "duration": 24.139111, 72 | "end_time": "2022-12-16T13:23:05.858134", 73 | "exception": false, 74 | "start_time": "2022-12-16T13:22:41.719023", 75 | "status": "completed" 76 | }, 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "topics = pd.read_csv(DATA_PATH + \"topics.csv\")\n", 82 | "content = pd.read_csv(DATA_PATH + \"content.csv\")\n", 83 | "correlations = pd.read_csv(DATA_PATH + \"correlations.csv\")\n", 84 | "\n", 85 | "topics = topics[topics.has_content==True]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "cb493013", 91 | "metadata": {}, 92 | "source": [ 93 | "## Split" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "e9c26411", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "topics_train = topics[topics['category'] == \"source\"][[\"id\"]]\n", 104 | "topics_train[\"fold\"] = \"train\"" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "26e43ad6", 111 | "metadata": { 112 | "execution": { 113 | "iopub.execute_input": "2022-12-16T13:23:12.222085Z", 114 | "iopub.status.busy": "2022-12-16T13:23:12.221455Z", 115 | "iopub.status.idle": "2022-12-16T13:23:12.465709Z", 116 | "shell.execute_reply": "2022-12-16T13:23:12.464474Z" 117 | }, 118 | "papermill": { 119 | "duration": 0.264501, 120 | "end_time": "2022-12-16T13:23:12.468560", 121 | "exception": false, 122 | "start_time": "2022-12-16T13:23:12.204059", 123 | 
"status": "completed" 124 | }, 125 | "tags": [] 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "topics_val = topics[topics['category'] != \"source\"].reset_index(drop=True)\n", 130 | "\n", 131 | "sgkf = StratifiedGroupKFold(random_state=1773,\n", 132 | " n_splits=4,\n", 133 | " shuffle=True)\n", 134 | "split_idxs = list(sgkf.split(topics_val[\"id\"],\n", 135 | " topics_val[\"language\"],\n", 136 | " groups=topics_val[\"id\"]))[0]\n", 137 | "\n", 138 | "split_idxs" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "d10b66af", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "topics_add_train = topics_val.iloc[split_idxs[0]].reset_index(drop=True)[[\"id\"]]\n", 149 | "topics_add_train[\"fold\"] = \"train\"" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "d6a66f63", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "topics_train = pd.concat([topics_train, topics_add_train], ignore_index=True)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "597a3bc7", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "topics_holdout = topics_val.iloc[split_idxs[1]].reset_index(drop=True)[[\"id\"]]\n", 170 | "topics_holdout[\"fold\"] = \"test\"" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "a94b0d77", 177 | "metadata": { 178 | "scrolled": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "topics[topics.id.isin(topics_train.id)].language.value_counts()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "f5277a8b", 189 | "metadata": { 190 | "scrolled": true 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "topics[topics.id.isin(topics_add_train.id)].language.value_counts()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "c559862e", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "topics[topics.id.isin(topics_holdout.id)].language.value_counts()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "7b4aadf3", 211 | "metadata": { 212 | "execution": { 213 | "iopub.execute_input": "2022-12-16T13:23:15.013233Z", 214 | "iopub.status.busy": "2022-12-16T13:23:15.012768Z", 215 | "iopub.status.idle": "2022-12-16T13:23:15.020818Z", 216 | "shell.execute_reply": "2022-12-16T13:23:15.019953Z" 217 | }, 218 | "papermill": { 219 | "duration": 0.03394, 220 | "end_time": "2022-12-16T13:23:15.023192", 221 | "exception": false, 222 | "start_time": "2022-12-16T13:23:14.989252", 223 | "status": "completed" 224 | }, 225 | "tags": [] 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "split_df = pd.concat([topics_train, topics_holdout], ignore_index=True)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "8e19f6c2", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "split_df.to_csv('train_test_splits.csv', index=False)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "id": "47cb14bf", 245 | "metadata": { 246 | "papermill": { 247 | "duration": 0.021488, 248 | "end_time": "2022-12-16T13:23:15.131659", 249 | "exception": false, 250 | "start_time": "2022-12-16T13:23:15.110171", 251 | "status": "completed" 252 | }, 253 | "tags": [] 254 | }, 255 | "source": [ 256 | "Done !" 
257 | ] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 3 (ipykernel)", 263 | "language": "python", 264 | "name": "python3" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.8.10" 277 | }, 278 | "papermill": { 279 | "default_parameters": {}, 280 | "duration": 45.346996, 281 | "end_time": "2022-12-16T13:23:16.176659", 282 | "environment_variables": {}, 283 | "exception": null, 284 | "input_path": "__notebook__.ipynb", 285 | "output_path": "__notebook__.ipynb", 286 | "parameters": {}, 287 | "start_time": "2022-12-16T13:22:30.829663", 288 | "version": "2.3.4" 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 5 293 | } 294 | -------------------------------------------------------------------------------- /utils/unsupervised_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | import gc 4 | 5 | from sentence_transformers import SentenceTransformer, models, InputExample, losses 6 | 7 | from cuml.neighbors import NearestNeighbors 8 | import cupy as cp 9 | import torch 10 | 11 | from .utils import generate_topic_tree 12 | 13 | 14 | def get_neighbors(topic_df, 15 | content_df, 16 | config_obj): 17 | # Create unsupervised model to extract embeddings 18 | model = SentenceTransformer(config_obj["unsupervised_model"]["save_name"]) 19 | model = model.to("cuda") 20 | 21 | # Predict 22 | topics_preds = model.encode(topic_df["model_input"], 23 | show_progress_bar=True, 24 | convert_to_tensor=True) 25 | topics_preds_gpu = cp.asarray(topics_preds) 26 | 27 | content_preds = model.encode(content_df["model_input"], 28 | show_progress_bar=True, 29 | convert_to_tensor=True, 30 | batch_size=100) 31 | content_preds_gpu = cp.asarray(content_preds) 32 | 33 | # Release memory 34 | torch.cuda.empty_cache() 35 | gc.collect() 36 | 37 | # KNN model 38 | print(' ') 39 | print('Training KNN model...') 40 | neighbors_model = NearestNeighbors(n_neighbors=config_obj["unsupervised_model"]["top_n"], 41 | metric='cosine') 42 | neighbors_model.fit(content_preds_gpu) 43 | indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance=False) 44 | predictions = [] 45 | for k in tqdm(range(len(indices))): 46 | pred = indices[k] 47 | p = ' '.join([content_df.loc[ind, 'id'] for ind in pred.get()]) 48 | predictions.append(p) 49 | topic_df['predictions'] = predictions 50 | 51 | # Release memory 52 | del topics_preds, content_preds, topics_preds_gpu, content_preds_gpu, neighbors_model, predictions, indices, model 53 | gc.collect() 54 | torch.cuda.empty_cache() 55 | gc.collect() 56 | 57 | return topic_df, content_df 58 | 59 | 60 | def build_training_set(topic_df, 61 | content_df, 62 | mode="local"): 63 | # Create lists for training 64 | topics_ids = [] 65 | content_ids = [] 66 | title1 = [] 67 | title2 = [] 68 | targets = [] 69 | # Iterate over each topic 70 | for k in tqdm(range(len(topic_df))): 71 | row = topic_df.iloc[k] 72 | topics_id = row['id'] 73 | topics_title = row['model_input'] 74 | predictions = row['predictions'].split(' ') 75 | 76 | if mode == "local": 77 | ground_truth = row['content_ids'].split(' ') 78 | 79 | for pred in predictions: 80 | content_title = content_df.loc[pred, 'model_input'] 81 | topics_ids.append(topics_id) 82 | 
content_ids.append(pred) 83 | title1.append(topics_title) 84 | title2.append(content_title) 85 | # If pred is in ground truth, 1 else 0 86 | if mode == "local": 87 | if pred in ground_truth: 88 | targets.append(1) 89 | else: 90 | targets.append(0) 91 | # Build training dataset 92 | train = pd.DataFrame( 93 | {'topics_ids': topics_ids, 94 | 'content_ids': content_ids, 95 | 'model_input1': title1, 96 | 'model_input2': title2 97 | } 98 | ) 99 | if mode == "local": 100 | train["target"] = targets 101 | 102 | return train 103 | 104 | 105 | def read_data(data_path, 106 | config_obj, 107 | read_mode="all"): 108 | topics = pd.read_csv(data_path + 'topics.csv') 109 | content = pd.read_csv(data_path + 'content.csv') 110 | 111 | if read_mode != "all": 112 | correlations = pd.read_csv(data_path + 'correlations.csv') 113 | else: 114 | correlations = None 115 | topic_trees = generate_topic_tree(topics) 116 | 117 | if read_mode != "all": 118 | splits = pd.read_csv("train_test_splits.csv") 119 | topics = topics[topics.id.isin(splits[splits.fold == read_mode].id)].reset_index(drop=True) 120 | 121 | topics = topics.merge(topic_trees, how="left", on="id") 122 | del topic_trees 123 | gc.collect() 124 | 125 | topic_tokens = generate_topic_model_input(input_df=topics) 126 | content_tokens = generate_content_model_input(input_df=content) 127 | 128 | unq_tokens = set(topic_tokens + content_tokens + ["nan"]) 129 | 130 | # Sort by title length to make inference faster 131 | topics['length'] = topics['title'].apply(lambda x: len(x)) 132 | content['length'] = content['title'].apply(lambda x: len(x)) 133 | topics.sort_values('length', inplace=True) 134 | content.sort_values('length', inplace=True) 135 | 136 | # Drop cols 137 | topics.drop(['length'], axis=1, 138 | inplace=True) 139 | content.drop(['length'], axis=1, 140 | inplace=True) 141 | # Reset index 142 | topics.reset_index(drop=True, inplace=True) 143 | content.reset_index(drop=True, inplace=True) 144 | print(' ') 145 | print('-' * 50) 146 | print(f"topics.shape: {topics.shape}") 147 | print(f"content.shape: {content.shape}") 148 | if read_mode != "all": 149 | print(f"correlations.shape: {correlations.shape}") 150 | 151 | return topics, content, correlations, unq_tokens 152 | 153 | 154 | def generate_token_features(input_df, 155 | token_features): 156 | """ 157 | :param input_df: Input topic dataframe 158 | :param token_features: Target columns for "unique value token encoding" 159 | :return: Tuple of (Dataframe with additional model input column, Unique Special Tokens) 160 | """ 161 | token_feature_set = None 162 | special_tokens = [] 163 | 164 | for token_feature in token_features: 165 | 166 | token_feature_str = "[<[" + token_feature + "_" +\ 167 | input_df[token_feature].astype(str) + "]>]" 168 | special_tokens += set(token_feature_str.values) 169 | 170 | if not isinstance(token_feature_set, pd.Series): 171 | token_feature_set = token_feature_str 172 | else: 173 | token_feature_set += " " + token_feature_str 174 | 175 | return token_feature_set, special_tokens 176 | 177 | 178 | def generate_topic_model_input(input_df): 179 | """ 180 | :param input_df: Input topic dataframe 181 | :return: Dataframe with additional model input column 182 | """ 183 | 184 | input_df.fillna("nan", inplace=True) 185 | 186 | token_features = [ 187 | "language", 188 | # "level", 189 | # "reverse_level" 190 | ] 191 | token_feature_text, special_tokens = generate_token_features(input_df=input_df, 192 | token_features=token_features) 193 | 194 | 195 | input_df["model_input"] = ( 196 | 
token_feature_text + 197 | " [<[topic_title]>] " + input_df["title"].astype(str) + 198 | " [<[topic_tree]>] " + input_df["topic_tree"].astype(str) + 199 | " [<[topic_desc]>] " + input_df["description"].astype(str) 200 | ).str.lower()#.str.split().apply(lambda x: " ".join(x[:seq_len])) 201 | 202 | del token_feature_text 203 | 204 | input_df.drop(['description', 'channel', 'category', 205 | 'level', 'parent', 'has_content'], 206 | axis=1, 207 | inplace=True) 208 | gc.collect() 209 | 210 | return special_tokens 211 | 212 | 213 | def generate_content_model_input(input_df): 214 | """ 215 | :param input_df: Input content dataframe 216 | :return: Dataframe with additional model input column 217 | """ 218 | 219 | input_df.fillna("nan", inplace=True) 220 | 221 | token_features = ["language", "kind"] 222 | token_feature_text, special_tokens = generate_token_features(input_df=input_df, 223 | token_features=token_features) 224 | 225 | input_df["model_input"] = ( 226 | token_feature_text + 227 | " [<[cntnt_title]>] " + input_df["title"].astype(str) + 228 | " [<[cntnt_desc]>] " + input_df["description"].astype(str) + 229 | " [<[cntnt_text]>] " + input_df["text"].astype(str) 230 | ).apply(lambda x: " ".join(x.split()[:512])).str.lower() 231 | 232 | del token_feature_text 233 | 234 | input_df.drop(['description', 'kind', 'text', 'copyright_holder', 'license'], 235 | axis=1, 236 | inplace=True) 237 | gc.collect() 238 | 239 | return special_tokens -------------------------------------------------------------------------------- /utils/evaluators.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Dict, Set, Callable, Optional 2 | from sentence_transformers.util import cos_sim 3 | from torch import Tensor 4 | import heapq 5 | from sentence_transformers import evaluation 6 | import os 7 | from tqdm import trange 8 | import torch 9 | import numpy as np 10 | 11 | class InformationRetrievalEvaluator(evaluation.SentenceEvaluator): 12 | """ 13 | This class evaluates an Information Retrieval (IR) setting. 14 | Given a set of queries and a large corpus set. It will retrieve for each query the top-k most similar document. 
It measures 15 | Mean Reciprocal Rank (MRR), Recall@k, and Normalized Discounted Cumulative Gain (NDCG) 16 | """ 17 | 18 | def __init__(self, 19 | queries: Dict[str, str], #qid => query 20 | corpus: Dict[str, str], #cid => doc 21 | relevant_docs: Dict[str, Set[str]], #qid => Set[cid] 22 | corpus_chunk_size: int = 50000, 23 | mrr_at_k: List[int] = [10], 24 | ndcg_at_k: List[int] = [10], 25 | accuracy_at_k: List[int] = [1, 3, 5, 10], 26 | precision_recall_at_k: List[int] = [1, 3, 5, 10], 27 | map_at_k: List[int] = [100], 28 | show_progress_bar: bool = False, 29 | batch_size: int = 32, 30 | name: str = '', 31 | write_csv: bool = True, 32 | score_functions: List[Callable[[Tensor, Tensor], Tensor] ] = {'cos_sim': cos_sim}, #Score function, higher=more similar 33 | main_score_function: str = None 34 | ): 35 | 36 | self.queries_ids = [] 37 | for qid in queries: 38 | if qid in relevant_docs and len(relevant_docs[qid]) > 0: 39 | self.queries_ids.append(qid) 40 | 41 | self.queries = [queries[qid] for qid in self.queries_ids] 42 | 43 | self.corpus_ids = list(corpus.keys()) 44 | self.corpus = [corpus[cid] for cid in self.corpus_ids] 45 | 46 | self.relevant_docs = relevant_docs 47 | self.corpus_chunk_size = corpus_chunk_size 48 | self.precision_recall_at_k = precision_recall_at_k 49 | 50 | self.show_progress_bar = show_progress_bar 51 | self.batch_size = batch_size 52 | self.name = name 53 | self.write_csv = write_csv 54 | self.score_functions = score_functions 55 | self.score_function_names = sorted(list(self.score_functions.keys())) 56 | self.main_score_function = main_score_function 57 | 58 | if name: 59 | name = "_" + name 60 | 61 | self.csv_file: str = "Information-Retrieval_evaluation" + name + "_results.csv" 62 | self.csv_headers = ["epoch", "steps"] 63 | 64 | for score_name in self.score_function_names: 65 | for k in precision_recall_at_k: 66 | self.csv_headers.append("{}-Precision@{}".format(score_name, k)) 67 | self.csv_headers.append("{}-Recall@{}".format(score_name, k)) 68 | 69 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs) -> float: 70 | if epoch != -1: 71 | out_txt = " after epoch {}:".format(epoch) if steps == -1 else " in epoch {} after {} steps:".format(epoch, steps) 72 | else: 73 | out_txt = ":" 74 | 75 | scores = self.compute_metrices(model, *args, **kwargs) 76 | 77 | # Write results to disc 78 | if output_path is not None and self.write_csv: 79 | csv_path = os.path.join(output_path, self.csv_file) 80 | if not os.path.isfile(csv_path): 81 | fOut = open(csv_path, mode="w", encoding="utf-8") 82 | fOut.write(",".join(self.csv_headers)) 83 | fOut.write("\n") 84 | 85 | else: 86 | fOut = open(csv_path, mode="a", encoding="utf-8") 87 | 88 | output_data = [epoch, steps] 89 | for name in self.score_function_names: 90 | for k in self.precision_recall_at_k: 91 | output_data.append(scores[name]['precision@k'][k]) 92 | output_data.append(scores[name]['recall@k'][k]) 93 | 94 | fOut.write(",".join(map(str, output_data))) 95 | fOut.write("\n") 96 | fOut.close() 97 | 98 | if self.main_score_function is None: 99 | return max([scores[name]['recall@k'][max(self.precision_recall_at_k)] for name in self.score_function_names]) 100 | else: 101 | return scores[self.main_score_function]['recall@k'][max(self.precision_recall_at_k)] 102 | 103 | def compute_metrices(self, model, corpus_model = None, corpus_embeddings: Tensor = None) -> Dict[str, float]: 104 | if corpus_model is None: 105 | corpus_model = model 106 | 107 | max_k = 
max(self.precision_recall_at_k) 108 | 109 | # Compute embedding for the queries 110 | query_embeddings = model.encode(self.queries, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True) 111 | 112 | queries_result_list = {} 113 | for name in self.score_functions: 114 | queries_result_list[name] = [[] for _ in range(len(query_embeddings))] 115 | 116 | #Iterate over chunks of the corpus 117 | for corpus_start_idx in trange(0, len(self.corpus), self.corpus_chunk_size, desc='Corpus Chunks', disable=not self.show_progress_bar): 118 | corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(self.corpus)) 119 | 120 | #Encode chunk of corpus 121 | if corpus_embeddings is None: 122 | sub_corpus_embeddings = corpus_model.encode(self.corpus[corpus_start_idx:corpus_end_idx], show_progress_bar=False, batch_size=self.batch_size, convert_to_tensor=True) 123 | else: 124 | sub_corpus_embeddings = corpus_embeddings[corpus_start_idx:corpus_end_idx] 125 | 126 | #Compute cosine similarites 127 | for name, score_function in self.score_functions.items(): 128 | pair_scores = score_function(query_embeddings, sub_corpus_embeddings) 129 | 130 | #Get top-k values 131 | pair_scores_top_k_values, pair_scores_top_k_idx = torch.topk(pair_scores, min(max_k, len(pair_scores[0])), dim=1, largest=True, sorted=False) 132 | pair_scores_top_k_values = pair_scores_top_k_values.cpu().tolist() 133 | pair_scores_top_k_idx = pair_scores_top_k_idx.cpu().tolist() 134 | 135 | for query_itr in range(len(query_embeddings)): 136 | for sub_corpus_id, score in zip(pair_scores_top_k_idx[query_itr], pair_scores_top_k_values[query_itr]): 137 | corpus_id = self.corpus_ids[corpus_start_idx+sub_corpus_id] 138 | if len(queries_result_list[name][query_itr]) < max_k: 139 | heapq.heappush(queries_result_list[name][query_itr], (score, corpus_id)) # heaqp tracks the quantity of the first element in the tuple 140 | else: 141 | heapq.heappushpop(queries_result_list[name][query_itr], (score, corpus_id)) 142 | 143 | for name in queries_result_list: 144 | for query_itr in range(len(queries_result_list[name])): 145 | for doc_itr in range(len(queries_result_list[name][query_itr])): 146 | score, corpus_id = queries_result_list[name][query_itr][doc_itr] 147 | queries_result_list[name][query_itr][doc_itr] = {'corpus_id': corpus_id, 'score': score} 148 | 149 | 150 | #Compute scores 151 | scores = {name: self.compute_metrics(queries_result_list[name]) for name in self.score_functions} 152 | 153 | return scores 154 | 155 | 156 | def compute_metrics(self, queries_result_list: List[object]): 157 | # Init score computation values 158 | precisions_at_k = {k: [] for k in self.precision_recall_at_k} 159 | recall_at_k = {k: [] for k in self.precision_recall_at_k} 160 | 161 | # Compute scores on results 162 | for query_itr in range(len(queries_result_list)): 163 | query_id = self.queries_ids[query_itr] 164 | 165 | # Sort scores 166 | top_hits = sorted(queries_result_list[query_itr], key=lambda x: x['score'], reverse=True) 167 | query_relevant_docs = self.relevant_docs[query_id] 168 | 169 | # Precision and Recall@k 170 | for k_val in self.precision_recall_at_k: 171 | num_correct = 0 172 | for hit in top_hits[0:k_val]: 173 | if hit['corpus_id'] in query_relevant_docs: 174 | num_correct += 1 175 | 176 | precisions_at_k[k_val].append(num_correct / k_val) 177 | recall_at_k[k_val].append(num_correct / len(query_relevant_docs)) 178 | 179 | 180 | for k in precisions_at_k: 181 | precisions_at_k[k] = np.mean(precisions_at_k[k]) 182 | 
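        # recall@k = |relevant ∩ top-k| / |relevant| per query, macro-averaged below.
        # __call__ returns the recall at the largest k (recall@100 with the list passed in
        # "1. Unsupervised Training.py"), so best-checkpoint selection tracks retrieval recall.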
183 | for k in recall_at_k: 184 | recall_at_k[k] = np.mean(recall_at_k[k]) 185 | 186 | 187 | return {'precision@k': precisions_at_k, 'recall@k': recall_at_k} -------------------------------------------------------------------------------- /4. Inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7b820d87", 6 | "metadata": { 7 | "papermill": { 8 | "duration": 0.006325, 9 | "end_time": "2023-02-23T15:13:33.107755", 10 | "exception": false, 11 | "start_time": "2023-02-23T15:13:33.101430", 12 | "status": "completed" 13 | }, 14 | "tags": [] 15 | }, 16 | "source": [ 17 | "# Sentence Transformers + Cross Encoders\n", 18 | "## 2-Stage Inference Pipeline (Long Trained)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "9f353b9c", 25 | "metadata": { 26 | "execution": { 27 | "iopub.execute_input": "2023-02-23T15:13:33.120545Z", 28 | "iopub.status.busy": "2023-02-23T15:13:33.119675Z", 29 | "iopub.status.idle": "2023-02-23T15:13:33.129725Z", 30 | "shell.execute_reply": "2023-02-23T15:13:33.128896Z" 31 | }, 32 | "papermill": { 33 | "duration": 0.018713, 34 | "end_time": "2023-02-23T15:13:33.131733", 35 | "exception": false, 36 | "start_time": "2023-02-23T15:13:33.113020", 37 | "status": "completed" 38 | }, 39 | "tags": [] 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "import os\n", 44 | "os.environ[\"KMP_DUPLICATE_LIB_OK\"]=\"TRUE\"" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "3733292a", 51 | "metadata": { 52 | "execution": { 53 | "iopub.execute_input": "2023-02-23T15:13:33.144274Z", 54 | "iopub.status.busy": "2023-02-23T15:13:33.143560Z", 55 | "iopub.status.idle": "2023-02-23T15:13:58.340278Z", 56 | "shell.execute_reply": "2023-02-23T15:13:58.338965Z" 57 | }, 58 | "papermill": { 59 | "duration": 25.205642, 60 | "end_time": "2023-02-23T15:13:58.342846", 61 | "exception": false, 62 | "start_time": "2023-02-23T15:13:33.137204", 63 | "status": "completed" 64 | }, 65 | "tags": [] 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "!cp \"/kaggle/input/sentence-transformers-lib-for-2023-01-23/sentence-transformers-2.2.2.piplib\" \"sentence-transformers-2.2.2.tar.gz\"\n", 70 | "!pip install sentence-transformers-2.2.2.tar.gz --no-dependencies -qqq" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "e96d2d82", 77 | "metadata": { 78 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 79 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 80 | "execution": { 81 | "iopub.execute_input": "2023-02-23T15:13:58.354846Z", 82 | "iopub.status.busy": "2023-02-23T15:13:58.354493Z", 83 | "iopub.status.idle": "2023-02-23T15:14:04.700951Z", 84 | "shell.execute_reply": "2023-02-23T15:14:04.699679Z" 85 | }, 86 | "papermill": { 87 | "duration": 6.354979, 88 | "end_time": "2023-02-23T15:14:04.703267", 89 | "exception": false, 90 | "start_time": "2023-02-23T15:13:58.348288", 91 | "status": "completed" 92 | }, 93 | "tags": [] 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "# =========================================================================================\n", 98 | "# Libraries\n", 99 | "# =========================================================================================\n", 100 | "import gc\n", 101 | "import time\n", 102 | "import math\n", 103 | "import random\n", 104 | "import warnings\n", 105 | "warnings.filterwarnings(\"ignore\")\n", 106 | "import numpy as np\n", 107 | 
"import pandas as pd\n", 108 | "from tqdm.auto import tqdm\n", 109 | "import json\n", 110 | "\n", 111 | "from cuml.neighbors import NearestNeighbors\n", 112 | "import cupy as cp\n", 113 | "\n", 114 | "from sentence_transformers import models, losses\n", 115 | "from sentence_transformers import SentenceTransformer, CrossEncoder, util\n", 116 | "from sentence_transformers.readers import InputExample\n", 117 | "from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator\n", 118 | "from torch.utils.data import DataLoader\n", 119 | "import torch\n", 120 | "\n", 121 | "%env TOKENIZERS_PARALLELISM=true\n", 122 | "%env TRANSFORMERS_NO_ADVISORY_WARNINGS=true" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "c9712324", 129 | "metadata": { 130 | "execution": { 131 | "iopub.execute_input": "2023-02-23T15:14:04.716086Z", 132 | "iopub.status.busy": "2023-02-23T15:14:04.715775Z", 133 | "iopub.status.idle": "2023-02-23T15:14:04.748578Z", 134 | "shell.execute_reply": "2023-02-23T15:14:04.747553Z" 135 | }, 136 | "papermill": { 137 | "duration": 0.041883, 138 | "end_time": "2023-02-23T15:14:04.750547", 139 | "exception": false, 140 | "start_time": "2023-02-23T15:14:04.708664", 141 | "status": "completed" 142 | }, 143 | "tags": [] 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "def get_neighbors(topic_df,\n", 148 | " content_df,\n", 149 | " config_obj):\n", 150 | " # Create unsupervised model to extract embeddings\n", 151 | " model = SentenceTransformer(config_obj[\"unsupervised_model\"][\"save_name\"])\n", 152 | " model = model.to(\"cuda\")\n", 153 | "\n", 154 | " # Predict\n", 155 | " topics_preds = model.encode(topic_df[\"model_input\"],\n", 156 | " show_progress_bar=True,\n", 157 | " convert_to_tensor=True)\n", 158 | " topics_preds_gpu = cp.asarray(topics_preds)\n", 159 | "\n", 160 | " content_preds = model.encode(content_df[\"model_input\"],\n", 161 | " show_progress_bar=True,\n", 162 | " convert_to_tensor=True,\n", 163 | " batch_size=100)\n", 164 | " content_preds_gpu = cp.asarray(content_preds)\n", 165 | "\n", 166 | " # Release memory\n", 167 | " torch.cuda.empty_cache()\n", 168 | " gc.collect()\n", 169 | "\n", 170 | " # KNN model\n", 171 | " print(' ')\n", 172 | " print('Training KNN model...')\n", 173 | " neighbors_model = NearestNeighbors(n_neighbors=config_obj[\"unsupervised_model\"][\"top_n\"],\n", 174 | " metric='cosine')\n", 175 | " neighbors_model.fit(content_preds_gpu)\n", 176 | " indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance=False)\n", 177 | " predictions = []\n", 178 | " for k in tqdm(range(len(indices))):\n", 179 | " pred = indices[k]\n", 180 | " p = ' '.join([content_df.loc[ind, 'id'] for ind in pred.get()])\n", 181 | " predictions.append(p)\n", 182 | " topic_df['predictions'] = predictions\n", 183 | "\n", 184 | " # Release memory\n", 185 | " del topics_preds, content_preds, topics_preds_gpu, content_preds_gpu, neighbors_model, predictions, indices, model\n", 186 | " gc.collect()\n", 187 | " torch.cuda.empty_cache()\n", 188 | " gc.collect()\n", 189 | "\n", 190 | " return topic_df, content_df\n", 191 | "\n", 192 | "\n", 193 | "\n", 194 | "def build_training_set(topic_df,\n", 195 | " content_df,\n", 196 | " mode=\"local\"):\n", 197 | " # Create lists for training\n", 198 | " topics_ids = []\n", 199 | " content_ids = []\n", 200 | " title1 = []\n", 201 | " title2 = []\n", 202 | " targets = []\n", 203 | " # Iterate over each topic\n", 204 | " for k in tqdm(range(len(topic_df))):\n", 205 
| " row = topic_df.iloc[k]\n", 206 | " topics_id = row['id']\n", 207 | " topics_title = row['model_input']\n", 208 | " predictions = row['predictions'].split(' ')\n", 209 | "\n", 210 | " if mode == \"local\":\n", 211 | " ground_truth = row['content_ids'].split(' ')\n", 212 | "\n", 213 | " for pred in predictions:\n", 214 | " content_title = content_df.loc[pred, 'model_input']\n", 215 | " topics_ids.append(topics_id)\n", 216 | " content_ids.append(pred)\n", 217 | " title1.append(topics_title)\n", 218 | " title2.append(content_title)\n", 219 | " # If pred is in ground truth, 1 else 0\n", 220 | " if mode == \"local\":\n", 221 | " if pred in ground_truth:\n", 222 | " targets.append(1)\n", 223 | " else:\n", 224 | " targets.append(0)\n", 225 | " # Build training dataset\n", 226 | " train = pd.DataFrame(\n", 227 | " {'topics_ids': topics_ids,\n", 228 | " 'content_ids': content_ids,\n", 229 | " 'model_input1': title1,\n", 230 | " 'model_input2': title2\n", 231 | " }\n", 232 | " )\n", 233 | " if mode == \"local\":\n", 234 | " train[\"target\"] = targets\n", 235 | "\n", 236 | " return train\n", 237 | "\n", 238 | "def read_data(data_path,\n", 239 | " config_obj,\n", 240 | " read_mode=\"all\"):\n", 241 | " topics = pd.read_csv(data_path + 'topics.csv')\n", 242 | " content = pd.read_csv(data_path + 'content.csv')\n", 243 | "\n", 244 | " if read_mode != \"all\":\n", 245 | " correlations = pd.read_csv(data_path + 'correlations.csv')\n", 246 | " else:\n", 247 | " correlations = None\n", 248 | " topic_trees = generate_topic_tree(topics)\n", 249 | "\n", 250 | " if read_mode != \"all\":\n", 251 | " splits = pd.read_csv(\"train_test_splits.csv\")\n", 252 | " topics = topics[topics.id.isin(splits[splits.fold == read_mode].id)].reset_index(drop=True)\n", 253 | "\n", 254 | " topics = topics.merge(topic_trees, how=\"left\", on=\"id\")\n", 255 | " del topic_trees\n", 256 | " gc.collect()\n", 257 | "\n", 258 | " topic_tokens = generate_topic_model_input(input_df=topics)\n", 259 | " content_tokens = generate_content_model_input(input_df=content)\n", 260 | "\n", 261 | " unq_tokens = set(topic_tokens + content_tokens + [\"nan\"])\n", 262 | "\n", 263 | " # Sort by title length to make inference faster\n", 264 | " topics['length'] = topics['title'].apply(lambda x: len(x))\n", 265 | " content['length'] = content['title'].apply(lambda x: len(x))\n", 266 | " topics.sort_values('length', inplace=True)\n", 267 | " content.sort_values('length', inplace=True)\n", 268 | "\n", 269 | " # Drop cols\n", 270 | " topics.drop(['length'], axis=1,\n", 271 | " inplace=True)\n", 272 | " content.drop(['length'], axis=1,\n", 273 | " inplace=True)\n", 274 | " # Reset index\n", 275 | " topics.reset_index(drop=True, inplace=True)\n", 276 | " content.reset_index(drop=True, inplace=True)\n", 277 | " print(' ')\n", 278 | " print('-' * 50)\n", 279 | " print(f\"topics.shape: {topics.shape}\")\n", 280 | " print(f\"content.shape: {content.shape}\")\n", 281 | " if read_mode != \"all\":\n", 282 | " print(f\"correlations.shape: {correlations.shape}\")\n", 283 | "\n", 284 | " return topics, content, correlations, unq_tokens\n", 285 | "\n", 286 | "\n", 287 | "def generate_token_features(input_df,\n", 288 | " token_features):\n", 289 | " \"\"\"\n", 290 | " :param input_df: Input topic dataframe\n", 291 | " :param token_features: Target columns for \"unique value token encoding\"\n", 292 | " :return: Tuple of (Dataframe with additional model input column, Unique Special Tokens)\n", 293 | " \"\"\"\n", 294 | " token_feature_set = None\n", 295 | " 
special_tokens = []\n", 296 | "\n", 297 | " for token_feature in token_features:\n", 298 | "\n", 299 | " token_feature_str = \"[<[\" + token_feature + \"_\" +\\\n", 300 | " input_df[token_feature].astype(str) + \"]>]\"\n", 301 | " special_tokens += set(token_feature_str.values)\n", 302 | "\n", 303 | " if not isinstance(token_feature_set, pd.Series):\n", 304 | " token_feature_set = token_feature_str\n", 305 | " else:\n", 306 | " token_feature_set += \" \" + token_feature_str\n", 307 | "\n", 308 | " return token_feature_set, special_tokens\n", 309 | "\n", 310 | "\n", 311 | "def generate_topic_model_input(input_df):\n", 312 | " \"\"\"\n", 313 | " :param input_df: Input topic dataframe\n", 314 | " :return: Dataframe with additional model input column\n", 315 | " \"\"\"\n", 316 | "\n", 317 | " input_df.fillna(\"nan\", inplace=True)\n", 318 | "\n", 319 | " token_features = [\n", 320 | " \"language\",\n", 321 | " # \"level\",\n", 322 | " # \"reverse_level\"\n", 323 | " ]\n", 324 | " token_feature_text, special_tokens = generate_token_features(input_df=input_df,\n", 325 | " token_features=token_features)\n", 326 | "\n", 327 | "\n", 328 | " input_df[\"model_input\"] = (\n", 329 | " token_feature_text +\n", 330 | " \" [<[topic_title]>] \" + input_df[\"title\"].astype(str) +\n", 331 | " \" [<[topic_tree]>] \" + input_df[\"topic_tree\"].astype(str) +\n", 332 | " \" [<[topic_desc]>] \" + input_df[\"description\"].astype(str)\n", 333 | " ).str.lower()#.str.split().apply(lambda x: \" \".join(x[:seq_len]))\n", 334 | "\n", 335 | " del token_feature_text\n", 336 | "\n", 337 | " input_df.drop(['description', 'channel', 'category',\n", 338 | " 'level', 'parent', 'has_content'],\n", 339 | " axis=1,\n", 340 | " inplace=True)\n", 341 | " gc.collect()\n", 342 | "\n", 343 | " return special_tokens\n", 344 | "\n", 345 | "\n", 346 | "def generate_content_model_input(input_df):\n", 347 | " \"\"\"\n", 348 | " :param input_df: Input content dataframe\n", 349 | " :return: Dataframe with additional model input column\n", 350 | " \"\"\"\n", 351 | "\n", 352 | " input_df.fillna(\"nan\", inplace=True)\n", 353 | "\n", 354 | " token_features = [\"language\", \"kind\"]\n", 355 | " token_feature_text, special_tokens = generate_token_features(input_df=input_df,\n", 356 | " token_features=token_features)\n", 357 | "\n", 358 | " input_df[\"model_input\"] = (\n", 359 | " token_feature_text +\n", 360 | " \" [<[cntnt_title]>] \" + input_df[\"title\"].astype(str) +\n", 361 | " \" [<[cntnt_desc]>] \" + input_df[\"description\"].astype(str) +\n", 362 | " \" [<[cntnt_text]>] \" + input_df[\"text\"].astype(str)\n", 363 | " ).apply(lambda x: \" \".join(x.split()[:512])).str.lower()\n", 364 | "\n", 365 | " del token_feature_text\n", 366 | "\n", 367 | " input_df.drop(['description', 'kind', 'text', 'copyright_holder', 'license'],\n", 368 | " axis=1,\n", 369 | " inplace=True)\n", 370 | " gc.collect()\n", 371 | "\n", 372 | " return special_tokens\n", 373 | "\n", 374 | "\n", 375 | "def generate_topic_tree(input_topic_df):\n", 376 | " df = pd.DataFrame()\n", 377 | "\n", 378 | " for channel in tqdm(input_topic_df['channel'].unique()):\n", 379 | " channel_df = input_topic_df[(input_topic_df['channel'] == channel)].reset_index(drop=True)\n", 380 | " for level in sorted(channel_df.level.unique()):\n", 381 | " # For level 0, it first creates a topic tree column which is the title of that topic.\n", 382 | " if level == 0:\n", 383 | " topic_tree = channel_df[channel_df['level'] == level]['title'].astype(str)\n", 384 | " topic_tree_df = 
pd.DataFrame([channel_df[channel_df['level'] == level][['id']], topic_tree.values]).T\n", 385 | " topic_tree_df.columns = ['child_id', 'topic_tree']\n", 386 | " channel_df = channel_df.merge(topic_tree_df, left_on='id', right_on='child_id', how='left').drop(\n", 387 | " ['child_id'], axis=1)\n", 388 | "\n", 389 | " # Once the topic tree column has been created, the parent node and child node is merged on parent_id = child_id\n", 390 | " topic_df_parent = channel_df[channel_df['level'] == level][['id', 'title', 'parent', 'topic_tree']]\n", 391 | " topic_df_parent.columns = 'parent_' + topic_df_parent.columns\n", 392 | "\n", 393 | " topic_df_child = channel_df[channel_df['level'] == level + 1][['id', 'title', 'parent', 'topic_tree']]\n", 394 | " topic_df_child.columns = 'child_' + topic_df_child.columns\n", 395 | "\n", 396 | " topic_df_merged = topic_df_parent.merge(topic_df_child, left_on='parent_id', right_on='child_parent')[\n", 397 | " ['child_id', 'parent_id', 'parent_title', 'child_title', 'parent_topic_tree']]\n", 398 | "\n", 399 | " # Topic tree is parent topic tree + title of the current child on that level\n", 400 | " topic_tree = topic_df_merged['parent_topic_tree'].astype(str) + ' > ' + topic_df_merged[\n", 401 | " 'child_title'].astype(str)\n", 402 | "\n", 403 | " topic_tree_df = pd.DataFrame([topic_df_merged['child_id'].values, topic_tree.values]).T\n", 404 | " topic_tree_df.columns = ['child_id', 'topic_tree']\n", 405 | "\n", 406 | " channel_df = channel_df.merge(topic_tree_df, left_on='id', right_on='child_id', how='left').drop(\n", 407 | " ['child_id'], axis=1)\n", 408 | " if 'topic_tree_y' in list(channel_df.columns):\n", 409 | " channel_df['topic_tree'] = channel_df['topic_tree_x'].combine_first(channel_df['topic_tree_y'])\n", 410 | " channel_df = channel_df.drop(['topic_tree_x', 'topic_tree_y'], axis=1)\n", 411 | "\n", 412 | " df = pd.concat([df, channel_df], ignore_index=True)\n", 413 | "\n", 414 | " df = df.merge(df.groupby(\"channel\")[\"level\"].max().rename(\"max_level_of_channel\").reset_index(),\n", 415 | " how=\"left\",\n", 416 | " on=\"channel\")\n", 417 | "\n", 418 | " df[\"reverse_level\"] = df[\"max_level_of_channel\"] - df[\"level\"]\n", 419 | "\n", 420 | " return df[[\"id\", \"topic_tree\", \"reverse_level\"]]" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "id": "7a88675d", 427 | "metadata": { 428 | "execution": { 429 | "iopub.execute_input": "2023-02-23T15:14:04.762676Z", 430 | "iopub.status.busy": "2023-02-23T15:14:04.761790Z", 431 | "iopub.status.idle": "2023-02-23T15:14:04.767070Z", 432 | "shell.execute_reply": "2023-02-23T15:14:04.766159Z" 433 | }, 434 | "papermill": { 435 | "duration": 0.013276, 436 | "end_time": "2023-02-23T15:14:04.769041", 437 | "exception": false, 438 | "start_time": "2023-02-23T15:14:04.755765", 439 | "status": "completed" 440 | }, 441 | "tags": [] 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "config = {\n", 446 | " \"unsupervised_model\": {\n", 447 | " \"save_name\": \"trained_models/unsupervised/paraphrase-multilingual-mpnet-base-v2\",\n", 448 | " \"seq_len\": 128,\n", 449 | " \"top_n\": 100,\n", 450 | " \"seed\": 42\n", 451 | " },\n", 452 | " \"supervised_model\": {\n", 453 | " \"save_name\": \"trained_models/supervised/paraphrase-multilingual-mpnet-base-v2\",\n", 454 | " \"threshold\": 0.015,\n", 455 | " \"seq_len\": 128\n", 456 | " }\n", 457 | "}\n", 458 | "\n", 459 | "DATA_PATH = \"../raw_data/\"" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 
null, 465 | "id": "0040639a", 466 | "metadata": { 467 | "execution": { 468 | "iopub.execute_input": "2023-02-23T15:14:04.780558Z", 469 | "iopub.status.busy": "2023-02-23T15:14:04.780295Z", 470 | "iopub.status.idle": "2023-02-23T15:14:04.808723Z", 471 | "shell.execute_reply": "2023-02-23T15:14:04.807881Z" 472 | }, 473 | "papermill": { 474 | "duration": 0.036685, 475 | "end_time": "2023-02-23T15:14:04.810858", 476 | "exception": false, 477 | "start_time": "2023-02-23T15:14:04.774173", 478 | "status": "completed" 479 | }, 480 | "tags": [] 481 | }, 482 | "outputs": [], 483 | "source": [ 484 | "submission_df = pd.read_csv(DATA_PATH + \"sample_submission.csv\")\n", 485 | "submission_df.head()" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "id": "3792f27d", 491 | "metadata": { 492 | "papermill": { 493 | "duration": 0.00517, 494 | "end_time": "2023-02-23T15:14:04.821540", 495 | "exception": false, 496 | "start_time": "2023-02-23T15:14:04.816370", 497 | "status": "completed" 498 | }, 499 | "tags": [] 500 | }, 501 | "source": [ 502 | "## 1. Retrieve" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "id": "4eb17d8f", 509 | "metadata": { 510 | "execution": { 511 | "iopub.execute_input": "2023-02-23T15:14:04.833240Z", 512 | "iopub.status.busy": "2023-02-23T15:14:04.832940Z", 513 | "iopub.status.idle": "2023-02-23T15:14:52.290194Z", 514 | "shell.execute_reply": "2023-02-23T15:14:52.288435Z" 515 | }, 516 | "papermill": { 517 | "duration": 47.466353, 518 | "end_time": "2023-02-23T15:14:52.293108", 519 | "exception": false, 520 | "start_time": "2023-02-23T15:14:04.826755", 521 | "status": "completed" 522 | }, 523 | "tags": [] 524 | }, 525 | "outputs": [], 526 | "source": [ 527 | "# Read data\n", 528 | "topics, content, correlations, _ = read_data(data_path=DATA_PATH,\n", 529 | " config_obj=config,\n", 530 | " read_mode=\"all\")" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "id": "4962fd33", 537 | "metadata": { 538 | "execution": { 539 | "iopub.execute_input": "2023-02-23T15:14:52.307313Z", 540 | "iopub.status.busy": "2023-02-23T15:14:52.307017Z", 541 | "iopub.status.idle": "2023-02-23T15:14:52.339996Z", 542 | "shell.execute_reply": "2023-02-23T15:14:52.339100Z" 543 | }, 544 | "papermill": { 545 | "duration": 0.04311, 546 | "end_time": "2023-02-23T15:14:52.342375", 547 | "exception": false, 548 | "start_time": "2023-02-23T15:14:52.299265", 549 | "status": "completed" 550 | }, 551 | "tags": [] 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "topics = topics[topics.id.isin(submission_df.topic_id)].reset_index(drop=True)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "id": "318a4b42", 562 | "metadata": { 563 | "execution": { 564 | "iopub.execute_input": "2023-02-23T15:14:52.355306Z", 565 | "iopub.status.busy": "2023-02-23T15:14:52.354438Z", 566 | "iopub.status.idle": "2023-02-23T15:23:01.948690Z", 567 | "shell.execute_reply": "2023-02-23T15:23:01.947558Z" 568 | }, 569 | "papermill": { 570 | "duration": 489.603181, 571 | "end_time": "2023-02-23T15:23:01.951109", 572 | "exception": false, 573 | "start_time": "2023-02-23T15:14:52.347928", 574 | "status": "completed" 575 | }, 576 | "tags": [] 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "# Run nearest neighbors\n", 581 | "topics, content = get_neighbors(topic_df=topics,\n", 582 | " content_df=content,\n", 583 | " config_obj=config)" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 
null, 589 | "id": "f8943a26", 590 | "metadata": { 591 | "execution": { 592 | "iopub.execute_input": "2023-02-23T15:23:01.964996Z", 593 | "iopub.status.busy": "2023-02-23T15:23:01.964666Z", 594 | "iopub.status.idle": "2023-02-23T15:23:01.969791Z", 595 | "shell.execute_reply": "2023-02-23T15:23:01.968777Z" 596 | }, 597 | "papermill": { 598 | "duration": 0.014101, 599 | "end_time": "2023-02-23T15:23:01.971944", 600 | "exception": false, 601 | "start_time": "2023-02-23T15:23:01.957843", 602 | "status": "completed" 603 | }, 604 | "tags": [] 605 | }, 606 | "outputs": [], 607 | "source": [ 608 | "# Set id as index for content\n", 609 | "content.set_index('id', inplace = True)" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "id": "e03bf7db", 616 | "metadata": { 617 | "execution": { 618 | "iopub.execute_input": "2023-02-23T15:23:01.984942Z", 619 | "iopub.status.busy": "2023-02-23T15:23:01.984403Z", 620 | "iopub.status.idle": "2023-02-23T15:23:02.045985Z", 621 | "shell.execute_reply": "2023-02-23T15:23:02.044960Z" 622 | }, 623 | "papermill": { 624 | "duration": 0.073789, 625 | "end_time": "2023-02-23T15:23:02.051503", 626 | "exception": false, 627 | "start_time": "2023-02-23T15:23:01.977714", 628 | "status": "completed" 629 | }, 630 | "tags": [] 631 | }, 632 | "outputs": [], 633 | "source": [ 634 | "df = build_training_set(topic_df=topics,\n", 635 | " content_df=content,\n", 636 | " mode=\"kaggle\")" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": null, 642 | "id": "2d9b604b", 643 | "metadata": { 644 | "execution": { 645 | "iopub.execute_input": "2023-02-23T15:23:02.064767Z", 646 | "iopub.status.busy": "2023-02-23T15:23:02.064461Z", 647 | "iopub.status.idle": "2023-02-23T15:23:02.074630Z", 648 | "shell.execute_reply": "2023-02-23T15:23:02.073778Z" 649 | }, 650 | "papermill": { 651 | "duration": 0.019169, 652 | "end_time": "2023-02-23T15:23:02.076681", 653 | "exception": false, 654 | "start_time": "2023-02-23T15:23:02.057512", 655 | "status": "completed" 656 | }, 657 | "tags": [] 658 | }, 659 | "outputs": [], 660 | "source": [ 661 | "df.head()" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "id": "6d8c1ce8", 668 | "metadata": { 669 | "execution": { 670 | "iopub.execute_input": "2023-02-23T15:23:02.089997Z", 671 | "iopub.status.busy": "2023-02-23T15:23:02.089683Z", 672 | "iopub.status.idle": "2023-02-23T15:23:02.162213Z", 673 | "shell.execute_reply": "2023-02-23T15:23:02.161079Z" 674 | }, 675 | "papermill": { 676 | "duration": 0.08174, 677 | "end_time": "2023-02-23T15:23:02.164562", 678 | "exception": false, 679 | "start_time": "2023-02-23T15:23:02.082822", 680 | "status": "completed" 681 | }, 682 | "tags": [] 683 | }, 684 | "outputs": [], 685 | "source": [ 686 | "del topics, content" 687 | ] 688 | }, 689 | { 690 | "cell_type": "markdown", 691 | "id": "842cb01b", 692 | "metadata": { 693 | "papermill": { 694 | "duration": 0.00618, 695 | "end_time": "2023-02-23T15:23:02.177129", 696 | "exception": false, 697 | "start_time": "2023-02-23T15:23:02.170949", 698 | "status": "completed" 699 | }, 700 | "tags": [] 701 | }, 702 | "source": [ 703 | "## 2. 
Re-Rank" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "id": "ff3b579e", 710 | "metadata": { 711 | "execution": { 712 | "iopub.execute_input": "2023-02-23T15:23:02.191720Z", 713 | "iopub.status.busy": "2023-02-23T15:23:02.191365Z", 714 | "iopub.status.idle": "2023-02-23T15:23:12.956495Z", 715 | "shell.execute_reply": "2023-02-23T15:23:12.955505Z" 716 | }, 717 | "papermill": { 718 | "duration": 10.775843, 719 | "end_time": "2023-02-23T15:23:12.959193", 720 | "exception": false, 721 | "start_time": "2023-02-23T15:23:02.183350", 722 | "status": "completed" 723 | }, 724 | "tags": [] 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "model = CrossEncoder(config[\"supervised_model\"][\"save_name\"],\n", 729 | " num_labels=1,\n", 730 | " max_length=config[\"supervised_model\"][\"seq_len\"])" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": null, 736 | "id": "5d45a453", 737 | "metadata": { 738 | "execution": { 739 | "iopub.execute_input": "2023-02-23T15:23:12.973617Z", 740 | "iopub.status.busy": "2023-02-23T15:23:12.973320Z", 741 | "iopub.status.idle": "2023-02-23T15:23:14.464909Z", 742 | "shell.execute_reply": "2023-02-23T15:23:14.463785Z" 743 | }, 744 | "papermill": { 745 | "duration": 1.501744, 746 | "end_time": "2023-02-23T15:23:14.467476", 747 | "exception": false, 748 | "start_time": "2023-02-23T15:23:12.965732", 749 | "status": "completed" 750 | }, 751 | "tags": [] 752 | }, 753 | "outputs": [], 754 | "source": [ 755 | "preds = model.predict(df[[\"model_input1\", \"model_input2\"]].values,\n", 756 | " show_progress_bar=True,\n", 757 | " batch_size=96)" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "id": "f71ce674", 764 | "metadata": { 765 | "execution": { 766 | "iopub.execute_input": "2023-02-23T15:23:14.481905Z", 767 | "iopub.status.busy": "2023-02-23T15:23:14.481568Z", 768 | "iopub.status.idle": "2023-02-23T15:23:14.486877Z", 769 | "shell.execute_reply": "2023-02-23T15:23:14.485787Z" 770 | }, 771 | "papermill": { 772 | "duration": 0.015178, 773 | "end_time": "2023-02-23T15:23:14.489247", 774 | "exception": false, 775 | "start_time": "2023-02-23T15:23:14.474069", 776 | "status": "completed" 777 | }, 778 | "tags": [] 779 | }, 780 | "outputs": [], 781 | "source": [ 782 | "df[\"pred_score\"] = preds" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": null, 788 | "id": "d64b0b43", 789 | "metadata": { 790 | "execution": { 791 | "iopub.execute_input": "2023-02-23T15:23:14.502802Z", 792 | "iopub.status.busy": "2023-02-23T15:23:14.502504Z", 793 | "iopub.status.idle": "2023-02-23T15:23:14.514378Z", 794 | "shell.execute_reply": "2023-02-23T15:23:14.513358Z" 795 | }, 796 | "papermill": { 797 | "duration": 0.022634, 798 | "end_time": "2023-02-23T15:23:14.517931", 799 | "exception": false, 800 | "start_time": "2023-02-23T15:23:14.495297", 801 | "status": "completed" 802 | }, 803 | "tags": [] 804 | }, 805 | "outputs": [], 806 | "source": [ 807 | "df.head()" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "id": "54994de6", 814 | "metadata": { 815 | "execution": { 816 | "iopub.execute_input": "2023-02-23T15:23:14.532685Z", 817 | "iopub.status.busy": "2023-02-23T15:23:14.532379Z", 818 | "iopub.status.idle": "2023-02-23T15:23:14.545105Z", 819 | "shell.execute_reply": "2023-02-23T15:23:14.544282Z" 820 | }, 821 | "papermill": { 822 | "duration": 0.022239, 823 | "end_time": "2023-02-23T15:23:14.547063", 824 | "exception": false, 
825 | "start_time": "2023-02-23T15:23:14.524824", 826 | "status": "completed" 827 | }, 828 | "tags": [] 829 | }, 830 | "outputs": [], 831 | "source": [ 832 | "sorted_pred_df = df.sort_values(by=\"pred_score\", ascending=False).reset_index(drop=True)\n", 833 | "\n", 834 | "preds_thr_df = sorted_pred_df[[\"topics_ids\",\"content_ids\"]][sorted_pred_df.pred_score >= config[\"supervised_model\"][\"threshold\"]].\\\n", 835 | " groupby(\"topics_ids\")[\"content_ids\"].apply(lambda x: \" \".join(x)).rename(\"pred_content_ids\").reset_index()" 836 | ] 837 | }, 838 | { 839 | "cell_type": "markdown", 840 | "id": "42aaa5ff", 841 | "metadata": { 842 | "papermill": { 843 | "duration": 0.006148, 844 | "end_time": "2023-02-23T15:23:14.559466", 845 | "exception": false, 846 | "start_time": "2023-02-23T15:23:14.553318", 847 | "status": "completed" 848 | }, 849 | "tags": [] 850 | }, 851 | "source": [ 852 | "## Create Submission" 853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": null, 858 | "id": "d9358cf3", 859 | "metadata": { 860 | "execution": { 861 | "iopub.execute_input": "2023-02-23T15:23:14.573798Z", 862 | "iopub.status.busy": "2023-02-23T15:23:14.573024Z", 863 | "iopub.status.idle": "2023-02-23T15:23:14.581256Z", 864 | "shell.execute_reply": "2023-02-23T15:23:14.580182Z" 865 | }, 866 | "papermill": { 867 | "duration": 0.017179, 868 | "end_time": "2023-02-23T15:23:14.583160", 869 | "exception": false, 870 | "start_time": "2023-02-23T15:23:14.565981", 871 | "status": "completed" 872 | }, 873 | "tags": [] 874 | }, 875 | "outputs": [], 876 | "source": [ 877 | "submission_df = submission_df.merge(preds_thr_df,\n", 878 | " how=\"left\",\n", 879 | " right_on=\"topics_ids\",\n", 880 | " left_on=\"topic_id\")[[\"topic_id\", \"pred_content_ids\"]]\n", 881 | "submission_df.rename(columns={\"pred_content_ids\": \"content_ids\"}, inplace=True)" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": null, 887 | "id": "cc258cd6", 888 | "metadata": { 889 | "execution": { 890 | "iopub.execute_input": "2023-02-23T15:23:14.596715Z", 891 | "iopub.status.busy": "2023-02-23T15:23:14.596449Z", 892 | "iopub.status.idle": "2023-02-23T15:23:14.605321Z", 893 | "shell.execute_reply": "2023-02-23T15:23:14.604442Z" 894 | }, 895 | "papermill": { 896 | "duration": 0.018181, 897 | "end_time": "2023-02-23T15:23:14.607572", 898 | "exception": false, 899 | "start_time": "2023-02-23T15:23:14.589391", 900 | "status": "completed" 901 | }, 902 | "tags": [] 903 | }, 904 | "outputs": [], 905 | "source": [ 906 | "submission_df" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": null, 912 | "id": "c723a154", 913 | "metadata": { 914 | "execution": { 915 | "iopub.execute_input": "2023-02-23T15:23:14.622141Z", 916 | "iopub.status.busy": "2023-02-23T15:23:14.621368Z", 917 | "iopub.status.idle": "2023-02-23T15:23:14.628021Z", 918 | "shell.execute_reply": "2023-02-23T15:23:14.627091Z" 919 | }, 920 | "papermill": { 921 | "duration": 0.016004, 922 | "end_time": "2023-02-23T15:23:14.629924", 923 | "exception": false, 924 | "start_time": "2023-02-23T15:23:14.613920", 925 | "status": "completed" 926 | }, 927 | "tags": [] 928 | }, 929 | "outputs": [], 930 | "source": [ 931 | "submission_df.to_csv(\"submission.csv\", index=False)" 932 | ] 933 | } 934 | ], 935 | "metadata": { 936 | "kernelspec": { 937 | "display_name": "Python 3 (ipykernel)", 938 | "language": "python", 939 | "name": "python3" 940 | }, 941 | "language_info": { 942 | "codemirror_mode": { 943 | "name": "ipython", 944 | 
"version": 3 945 | }, 946 | "file_extension": ".py", 947 | "mimetype": "text/x-python", 948 | "name": "python", 949 | "nbconvert_exporter": "python", 950 | "pygments_lexer": "ipython3", 951 | "version": "3.8.10" 952 | }, 953 | "papermill": { 954 | "default_parameters": {}, 955 | "duration": 591.59175, 956 | "end_time": "2023-02-23T15:23:17.226549", 957 | "environment_variables": {}, 958 | "exception": null, 959 | "input_path": "__notebook__.ipynb", 960 | "output_path": "__notebook__.ipynb", 961 | "parameters": {}, 962 | "start_time": "2023-02-23T15:13:25.634799", 963 | "version": "2.3.4" 964 | }, 965 | "widgets": { 966 | "application/vnd.jupyter.widget-state+json": { 967 | "state": { 968 | "000c44770f834dfe8d4b821ce0c3128c": { 969 | "model_module": "@jupyter-widgets/controls", 970 | "model_module_version": "1.5.0", 971 | "model_name": "FloatProgressModel", 972 | "state": { 973 | "_dom_classes": [], 974 | "_model_module": "@jupyter-widgets/controls", 975 | "_model_module_version": "1.5.0", 976 | "_model_name": "FloatProgressModel", 977 | "_view_count": null, 978 | "_view_module": "@jupyter-widgets/controls", 979 | "_view_module_version": "1.5.0", 980 | "_view_name": "ProgressView", 981 | "bar_style": "success", 982 | "description": "", 983 | "description_tooltip": null, 984 | "layout": "IPY_MODEL_a854925446b1440cbeb3b499ea722944", 985 | "max": 171, 986 | "min": 0, 987 | "orientation": "horizontal", 988 | "style": "IPY_MODEL_bc3a207ec7644679b0d003bf9b099051", 989 | "value": 171 990 | } 991 | }, 992 | "02b655f2dd8e453c927f829e5e0377c9": { 993 | "model_module": "@jupyter-widgets/controls", 994 | "model_module_version": "1.5.0", 995 | "model_name": "ProgressStyleModel", 996 | "state": { 997 | "_model_module": "@jupyter-widgets/controls", 998 | "_model_module_version": "1.5.0", 999 | "_model_name": "ProgressStyleModel", 1000 | "_view_count": null, 1001 | "_view_module": "@jupyter-widgets/base", 1002 | "_view_module_version": "1.2.0", 1003 | "_view_name": "StyleView", 1004 | "bar_color": null, 1005 | "description_width": "" 1006 | } 1007 | }, 1008 | "02f0169ac226471e83090c438fdf8014": { 1009 | "model_module": "@jupyter-widgets/controls", 1010 | "model_module_version": "1.5.0", 1011 | "model_name": "HTMLModel", 1012 | "state": { 1013 | "_dom_classes": [], 1014 | "_model_module": "@jupyter-widgets/controls", 1015 | "_model_module_version": "1.5.0", 1016 | "_model_name": "HTMLModel", 1017 | "_view_count": null, 1018 | "_view_module": "@jupyter-widgets/controls", 1019 | "_view_module_version": "1.5.0", 1020 | "_view_name": "HTMLView", 1021 | "description": "", 1022 | "description_tooltip": null, 1023 | "layout": "IPY_MODEL_e73980b6b1db4ef7a191c71fbc5d8831", 1024 | "placeholder": "​", 1025 | "style": "IPY_MODEL_e29b331e164649b8b6e252b71b47434b", 1026 | "value": " 171/171 [00:14<00:00, 20.16it/s]" 1027 | } 1028 | }, 1029 | "04817290568b4998a6e4ae14f5059961": { 1030 | "model_module": "@jupyter-widgets/controls", 1031 | "model_module_version": "1.5.0", 1032 | "model_name": "HBoxModel", 1033 | "state": { 1034 | "_dom_classes": [], 1035 | "_model_module": "@jupyter-widgets/controls", 1036 | "_model_module_version": "1.5.0", 1037 | "_model_name": "HBoxModel", 1038 | "_view_count": null, 1039 | "_view_module": "@jupyter-widgets/controls", 1040 | "_view_module_version": "1.5.0", 1041 | "_view_name": "HBoxView", 1042 | "box_style": "", 1043 | "children": [ 1044 | "IPY_MODEL_302217286df24fdda1da73b6e2ac07d5", 1045 | "IPY_MODEL_2229bdabe0b0470db0ace9182b9dc463", 1046 | 
"IPY_MODEL_1c63d1cbf66740aeb882233fea9aa0b0" 1047 | ], 1048 | "layout": "IPY_MODEL_404fc9d374dc481abbdd0d4a12567375" 1049 | } 1050 | }, 1051 | "0c6f44b8092841cea7e9ef1fd90ae255": { 1052 | "model_module": "@jupyter-widgets/base", 1053 | "model_module_version": "1.2.0", 1054 | "model_name": "LayoutModel", 1055 | "state": { 1056 | "_model_module": "@jupyter-widgets/base", 1057 | "_model_module_version": "1.2.0", 1058 | "_model_name": "LayoutModel", 1059 | "_view_count": null, 1060 | "_view_module": "@jupyter-widgets/base", 1061 | "_view_module_version": "1.2.0", 1062 | "_view_name": "LayoutView", 1063 | "align_content": null, 1064 | "align_items": null, 1065 | "align_self": null, 1066 | "border": null, 1067 | "bottom": null, 1068 | "display": null, 1069 | "flex": null, 1070 | "flex_flow": null, 1071 | "grid_area": null, 1072 | "grid_auto_columns": null, 1073 | "grid_auto_flow": null, 1074 | "grid_auto_rows": null, 1075 | "grid_column": null, 1076 | "grid_gap": null, 1077 | "grid_row": null, 1078 | "grid_template_areas": null, 1079 | "grid_template_columns": null, 1080 | "grid_template_rows": null, 1081 | "height": null, 1082 | "justify_content": null, 1083 | "justify_items": null, 1084 | "left": null, 1085 | "margin": null, 1086 | "max_height": null, 1087 | "max_width": null, 1088 | "min_height": null, 1089 | "min_width": null, 1090 | "object_fit": null, 1091 | "object_position": null, 1092 | "order": null, 1093 | "overflow": null, 1094 | "overflow_x": null, 1095 | "overflow_y": null, 1096 | "padding": null, 1097 | "right": null, 1098 | "top": null, 1099 | "visibility": null, 1100 | "width": null 1101 | } 1102 | }, 1103 | "0d91952d22b94bf08295547b4dabde34": { 1104 | "model_module": "@jupyter-widgets/controls", 1105 | "model_module_version": "1.5.0", 1106 | "model_name": "HBoxModel", 1107 | "state": { 1108 | "_dom_classes": [], 1109 | "_model_module": "@jupyter-widgets/controls", 1110 | "_model_module_version": "1.5.0", 1111 | "_model_name": "HBoxModel", 1112 | "_view_count": null, 1113 | "_view_module": "@jupyter-widgets/controls", 1114 | "_view_module_version": "1.5.0", 1115 | "_view_name": "HBoxView", 1116 | "box_style": "", 1117 | "children": [ 1118 | "IPY_MODEL_74579f1f7b464019bdd7292e7a4aa76a", 1119 | "IPY_MODEL_66ed3b2fe5c84b29a90c0e9271004db6", 1120 | "IPY_MODEL_111ba575c16344dfb8ede70a98be5437" 1121 | ], 1122 | "layout": "IPY_MODEL_4b9b1817e2e74816a46fb722cf6fffbe" 1123 | } 1124 | }, 1125 | "111ba575c16344dfb8ede70a98be5437": { 1126 | "model_module": "@jupyter-widgets/controls", 1127 | "model_module_version": "1.5.0", 1128 | "model_name": "HTMLModel", 1129 | "state": { 1130 | "_dom_classes": [], 1131 | "_model_module": "@jupyter-widgets/controls", 1132 | "_model_module_version": "1.5.0", 1133 | "_model_name": "HTMLModel", 1134 | "_view_count": null, 1135 | "_view_module": "@jupyter-widgets/controls", 1136 | "_view_module_version": "1.5.0", 1137 | "_view_name": "HTMLView", 1138 | "description": "", 1139 | "description_tooltip": null, 1140 | "layout": "IPY_MODEL_b51cfc1f0f824b0b8c5546d582b41d6b", 1141 | "placeholder": "​", 1142 | "style": "IPY_MODEL_3545a7f07dfa4a54b9b9b43eab645ba8", 1143 | "value": " 5/5 [00:00<00:00, 149.74it/s]" 1144 | } 1145 | }, 1146 | "152c248947e343e1b1768b89464319f0": { 1147 | "model_module": "@jupyter-widgets/controls", 1148 | "model_module_version": "1.5.0", 1149 | "model_name": "DescriptionStyleModel", 1150 | "state": { 1151 | "_model_module": "@jupyter-widgets/controls", 1152 | "_model_module_version": "1.5.0", 1153 | "_model_name": "DescriptionStyleModel", 
1154 | "_view_count": null, 1155 | "_view_module": "@jupyter-widgets/base", 1156 | "_view_module_version": "1.2.0", 1157 | "_view_name": "StyleView", 1158 | "description_width": "" 1159 | } 1160 | }, 1161 | "18ba9ed02bfa42c8a519b106b16dfc8e": { 1162 | "model_module": "@jupyter-widgets/base", 1163 | "model_module_version": "1.2.0", 1164 | "model_name": "LayoutModel", 1165 | "state": { 1166 | "_model_module": "@jupyter-widgets/base", 1167 | "_model_module_version": "1.2.0", 1168 | "_model_name": "LayoutModel", 1169 | "_view_count": null, 1170 | "_view_module": "@jupyter-widgets/base", 1171 | "_view_module_version": "1.2.0", 1172 | "_view_name": "LayoutView", 1173 | "align_content": null, 1174 | "align_items": null, 1175 | "align_self": null, 1176 | "border": null, 1177 | "bottom": null, 1178 | "display": null, 1179 | "flex": null, 1180 | "flex_flow": null, 1181 | "grid_area": null, 1182 | "grid_auto_columns": null, 1183 | "grid_auto_flow": null, 1184 | "grid_auto_rows": null, 1185 | "grid_column": null, 1186 | "grid_gap": null, 1187 | "grid_row": null, 1188 | "grid_template_areas": null, 1189 | "grid_template_columns": null, 1190 | "grid_template_rows": null, 1191 | "height": null, 1192 | "justify_content": null, 1193 | "justify_items": null, 1194 | "left": null, 1195 | "margin": null, 1196 | "max_height": null, 1197 | "max_width": null, 1198 | "min_height": null, 1199 | "min_width": null, 1200 | "object_fit": null, 1201 | "object_position": null, 1202 | "order": null, 1203 | "overflow": null, 1204 | "overflow_x": null, 1205 | "overflow_y": null, 1206 | "padding": null, 1207 | "right": null, 1208 | "top": null, 1209 | "visibility": null, 1210 | "width": null 1211 | } 1212 | }, 1213 | "1be702359a024806bc209f4906b95f65": { 1214 | "model_module": "@jupyter-widgets/base", 1215 | "model_module_version": "1.2.0", 1216 | "model_name": "LayoutModel", 1217 | "state": { 1218 | "_model_module": "@jupyter-widgets/base", 1219 | "_model_module_version": "1.2.0", 1220 | "_model_name": "LayoutModel", 1221 | "_view_count": null, 1222 | "_view_module": "@jupyter-widgets/base", 1223 | "_view_module_version": "1.2.0", 1224 | "_view_name": "LayoutView", 1225 | "align_content": null, 1226 | "align_items": null, 1227 | "align_self": null, 1228 | "border": null, 1229 | "bottom": null, 1230 | "display": null, 1231 | "flex": null, 1232 | "flex_flow": null, 1233 | "grid_area": null, 1234 | "grid_auto_columns": null, 1235 | "grid_auto_flow": null, 1236 | "grid_auto_rows": null, 1237 | "grid_column": null, 1238 | "grid_gap": null, 1239 | "grid_row": null, 1240 | "grid_template_areas": null, 1241 | "grid_template_columns": null, 1242 | "grid_template_rows": null, 1243 | "height": null, 1244 | "justify_content": null, 1245 | "justify_items": null, 1246 | "left": null, 1247 | "margin": null, 1248 | "max_height": null, 1249 | "max_width": null, 1250 | "min_height": null, 1251 | "min_width": null, 1252 | "object_fit": null, 1253 | "object_position": null, 1254 | "order": null, 1255 | "overflow": null, 1256 | "overflow_x": null, 1257 | "overflow_y": null, 1258 | "padding": null, 1259 | "right": null, 1260 | "top": null, 1261 | "visibility": null, 1262 | "width": null 1263 | } 1264 | }, 1265 | "1c63d1cbf66740aeb882233fea9aa0b0": { 1266 | "model_module": "@jupyter-widgets/controls", 1267 | "model_module_version": "1.5.0", 1268 | "model_name": "HTMLModel", 1269 | "state": { 1270 | "_dom_classes": [], 1271 | "_model_module": "@jupyter-widgets/controls", 1272 | "_model_module_version": "1.5.0", 1273 | "_model_name": "HTMLModel", 
1274 | "_view_count": null, 1275 | "_view_module": "@jupyter-widgets/controls", 1276 | "_view_module_version": "1.5.0", 1277 | "_view_name": "HTMLView", 1278 | "description": "", 1279 | "description_tooltip": null, 1280 | "layout": "IPY_MODEL_33b28c8cf2884cd09e1308ed8cd992a7", 1281 | "placeholder": "​", 1282 | "style": "IPY_MODEL_28c870099263407f8eb005188eaa7b77", 1283 | "value": " 1541/1541 [07:29<00:00, 7.44it/s]" 1284 | } 1285 | }, 1286 | "203aa20d46234f90996d8a060c66f404": { 1287 | "model_module": "@jupyter-widgets/controls", 1288 | "model_module_version": "1.5.0", 1289 | "model_name": "DescriptionStyleModel", 1290 | "state": { 1291 | "_model_module": "@jupyter-widgets/controls", 1292 | "_model_module_version": "1.5.0", 1293 | "_model_name": "DescriptionStyleModel", 1294 | "_view_count": null, 1295 | "_view_module": "@jupyter-widgets/base", 1296 | "_view_module_version": "1.2.0", 1297 | "_view_name": "StyleView", 1298 | "description_width": "" 1299 | } 1300 | }, 1301 | "2229bdabe0b0470db0ace9182b9dc463": { 1302 | "model_module": "@jupyter-widgets/controls", 1303 | "model_module_version": "1.5.0", 1304 | "model_name": "FloatProgressModel", 1305 | "state": { 1306 | "_dom_classes": [], 1307 | "_model_module": "@jupyter-widgets/controls", 1308 | "_model_module_version": "1.5.0", 1309 | "_model_name": "FloatProgressModel", 1310 | "_view_count": null, 1311 | "_view_module": "@jupyter-widgets/controls", 1312 | "_view_module_version": "1.5.0", 1313 | "_view_name": "ProgressView", 1314 | "bar_style": "success", 1315 | "description": "", 1316 | "description_tooltip": null, 1317 | "layout": "IPY_MODEL_18ba9ed02bfa42c8a519b106b16dfc8e", 1318 | "max": 1541, 1319 | "min": 0, 1320 | "orientation": "horizontal", 1321 | "style": "IPY_MODEL_bf62fd78fbc14ffb9a806202b3df1246", 1322 | "value": 1541 1323 | } 1324 | }, 1325 | "280c00a38f504ce19c649036b54d0cf8": { 1326 | "model_module": "@jupyter-widgets/controls", 1327 | "model_module_version": "1.5.0", 1328 | "model_name": "DescriptionStyleModel", 1329 | "state": { 1330 | "_model_module": "@jupyter-widgets/controls", 1331 | "_model_module_version": "1.5.0", 1332 | "_model_name": "DescriptionStyleModel", 1333 | "_view_count": null, 1334 | "_view_module": "@jupyter-widgets/base", 1335 | "_view_module_version": "1.2.0", 1336 | "_view_name": "StyleView", 1337 | "description_width": "" 1338 | } 1339 | }, 1340 | "2833367e24a546f6828fe64c053c8f75": { 1341 | "model_module": "@jupyter-widgets/base", 1342 | "model_module_version": "1.2.0", 1343 | "model_name": "LayoutModel", 1344 | "state": { 1345 | "_model_module": "@jupyter-widgets/base", 1346 | "_model_module_version": "1.2.0", 1347 | "_model_name": "LayoutModel", 1348 | "_view_count": null, 1349 | "_view_module": "@jupyter-widgets/base", 1350 | "_view_module_version": "1.2.0", 1351 | "_view_name": "LayoutView", 1352 | "align_content": null, 1353 | "align_items": null, 1354 | "align_self": null, 1355 | "border": null, 1356 | "bottom": null, 1357 | "display": null, 1358 | "flex": null, 1359 | "flex_flow": null, 1360 | "grid_area": null, 1361 | "grid_auto_columns": null, 1362 | "grid_auto_flow": null, 1363 | "grid_auto_rows": null, 1364 | "grid_column": null, 1365 | "grid_gap": null, 1366 | "grid_row": null, 1367 | "grid_template_areas": null, 1368 | "grid_template_columns": null, 1369 | "grid_template_rows": null, 1370 | "height": null, 1371 | "justify_content": null, 1372 | "justify_items": null, 1373 | "left": null, 1374 | "margin": null, 1375 | "max_height": null, 1376 | "max_width": null, 1377 | "min_height": 
null, 1378 | "min_width": null, 1379 | "object_fit": null, 1380 | "object_position": null, 1381 | "order": null, 1382 | "overflow": null, 1383 | "overflow_x": null, 1384 | "overflow_y": null, 1385 | "padding": null, 1386 | "right": null, 1387 | "top": null, 1388 | "visibility": null, 1389 | "width": null 1390 | } 1391 | }, 1392 | "28c870099263407f8eb005188eaa7b77": { 1393 | "model_module": "@jupyter-widgets/controls", 1394 | "model_module_version": "1.5.0", 1395 | "model_name": "DescriptionStyleModel", 1396 | "state": { 1397 | "_model_module": "@jupyter-widgets/controls", 1398 | "_model_module_version": "1.5.0", 1399 | "_model_name": "DescriptionStyleModel", 1400 | "_view_count": null, 1401 | "_view_module": "@jupyter-widgets/base", 1402 | "_view_module_version": "1.2.0", 1403 | "_view_name": "StyleView", 1404 | "description_width": "" 1405 | } 1406 | }, 1407 | "302217286df24fdda1da73b6e2ac07d5": { 1408 | "model_module": "@jupyter-widgets/controls", 1409 | "model_module_version": "1.5.0", 1410 | "model_name": "HTMLModel", 1411 | "state": { 1412 | "_dom_classes": [], 1413 | "_model_module": "@jupyter-widgets/controls", 1414 | "_model_module_version": "1.5.0", 1415 | "_model_name": "HTMLModel", 1416 | "_view_count": null, 1417 | "_view_module": "@jupyter-widgets/controls", 1418 | "_view_module_version": "1.5.0", 1419 | "_view_name": "HTMLView", 1420 | "description": "", 1421 | "description_tooltip": null, 1422 | "layout": "IPY_MODEL_2833367e24a546f6828fe64c053c8f75", 1423 | "placeholder": "​", 1424 | "style": "IPY_MODEL_fd923f56550241b3a93bde41ffe0dbf1", 1425 | "value": "Batches: 100%" 1426 | } 1427 | }, 1428 | "33b28c8cf2884cd09e1308ed8cd992a7": { 1429 | "model_module": "@jupyter-widgets/base", 1430 | "model_module_version": "1.2.0", 1431 | "model_name": "LayoutModel", 1432 | "state": { 1433 | "_model_module": "@jupyter-widgets/base", 1434 | "_model_module_version": "1.2.0", 1435 | "_model_name": "LayoutModel", 1436 | "_view_count": null, 1437 | "_view_module": "@jupyter-widgets/base", 1438 | "_view_module_version": "1.2.0", 1439 | "_view_name": "LayoutView", 1440 | "align_content": null, 1441 | "align_items": null, 1442 | "align_self": null, 1443 | "border": null, 1444 | "bottom": null, 1445 | "display": null, 1446 | "flex": null, 1447 | "flex_flow": null, 1448 | "grid_area": null, 1449 | "grid_auto_columns": null, 1450 | "grid_auto_flow": null, 1451 | "grid_auto_rows": null, 1452 | "grid_column": null, 1453 | "grid_gap": null, 1454 | "grid_row": null, 1455 | "grid_template_areas": null, 1456 | "grid_template_columns": null, 1457 | "grid_template_rows": null, 1458 | "height": null, 1459 | "justify_content": null, 1460 | "justify_items": null, 1461 | "left": null, 1462 | "margin": null, 1463 | "max_height": null, 1464 | "max_width": null, 1465 | "min_height": null, 1466 | "min_width": null, 1467 | "object_fit": null, 1468 | "object_position": null, 1469 | "order": null, 1470 | "overflow": null, 1471 | "overflow_x": null, 1472 | "overflow_y": null, 1473 | "padding": null, 1474 | "right": null, 1475 | "top": null, 1476 | "visibility": null, 1477 | "width": null 1478 | } 1479 | }, 1480 | "3545a7f07dfa4a54b9b9b43eab645ba8": { 1481 | "model_module": "@jupyter-widgets/controls", 1482 | "model_module_version": "1.5.0", 1483 | "model_name": "DescriptionStyleModel", 1484 | "state": { 1485 | "_model_module": "@jupyter-widgets/controls", 1486 | "_model_module_version": "1.5.0", 1487 | "_model_name": "DescriptionStyleModel", 1488 | "_view_count": null, 1489 | "_view_module": "@jupyter-widgets/base", 
1490 | "_view_module_version": "1.2.0", 1491 | "_view_name": "StyleView", 1492 | "description_width": "" 1493 | } 1494 | }, 1495 | "3b94d51e2a134321b9c3a072619f4808": { 1496 | "model_module": "@jupyter-widgets/base", 1497 | "model_module_version": "1.2.0", 1498 | "model_name": "LayoutModel", 1499 | "state": { 1500 | "_model_module": "@jupyter-widgets/base", 1501 | "_model_module_version": "1.2.0", 1502 | "_model_name": "LayoutModel", 1503 | "_view_count": null, 1504 | "_view_module": "@jupyter-widgets/base", 1505 | "_view_module_version": "1.2.0", 1506 | "_view_name": "LayoutView", 1507 | "align_content": null, 1508 | "align_items": null, 1509 | "align_self": null, 1510 | "border": null, 1511 | "bottom": null, 1512 | "display": null, 1513 | "flex": null, 1514 | "flex_flow": null, 1515 | "grid_area": null, 1516 | "grid_auto_columns": null, 1517 | "grid_auto_flow": null, 1518 | "grid_auto_rows": null, 1519 | "grid_column": null, 1520 | "grid_gap": null, 1521 | "grid_row": null, 1522 | "grid_template_areas": null, 1523 | "grid_template_columns": null, 1524 | "grid_template_rows": null, 1525 | "height": null, 1526 | "justify_content": null, 1527 | "justify_items": null, 1528 | "left": null, 1529 | "margin": null, 1530 | "max_height": null, 1531 | "max_width": null, 1532 | "min_height": null, 1533 | "min_width": null, 1534 | "object_fit": null, 1535 | "object_position": null, 1536 | "order": null, 1537 | "overflow": null, 1538 | "overflow_x": null, 1539 | "overflow_y": null, 1540 | "padding": null, 1541 | "right": null, 1542 | "top": null, 1543 | "visibility": null, 1544 | "width": null 1545 | } 1546 | }, 1547 | "3f8a645ac0614151a0edce6965a0fc7d": { 1548 | "model_module": "@jupyter-widgets/controls", 1549 | "model_module_version": "1.5.0", 1550 | "model_name": "HTMLModel", 1551 | "state": { 1552 | "_dom_classes": [], 1553 | "_model_module": "@jupyter-widgets/controls", 1554 | "_model_module_version": "1.5.0", 1555 | "_model_name": "HTMLModel", 1556 | "_view_count": null, 1557 | "_view_module": "@jupyter-widgets/controls", 1558 | "_view_module_version": "1.5.0", 1559 | "_view_name": "HTMLView", 1560 | "description": "", 1561 | "description_tooltip": null, 1562 | "layout": "IPY_MODEL_572ca3d62de54d3e9381b57fe59621a9", 1563 | "placeholder": "​", 1564 | "style": "IPY_MODEL_152c248947e343e1b1768b89464319f0", 1565 | "value": "100%" 1566 | } 1567 | }, 1568 | "404fc9d374dc481abbdd0d4a12567375": { 1569 | "model_module": "@jupyter-widgets/base", 1570 | "model_module_version": "1.2.0", 1571 | "model_name": "LayoutModel", 1572 | "state": { 1573 | "_model_module": "@jupyter-widgets/base", 1574 | "_model_module_version": "1.2.0", 1575 | "_model_name": "LayoutModel", 1576 | "_view_count": null, 1577 | "_view_module": "@jupyter-widgets/base", 1578 | "_view_module_version": "1.2.0", 1579 | "_view_name": "LayoutView", 1580 | "align_content": null, 1581 | "align_items": null, 1582 | "align_self": null, 1583 | "border": null, 1584 | "bottom": null, 1585 | "display": null, 1586 | "flex": null, 1587 | "flex_flow": null, 1588 | "grid_area": null, 1589 | "grid_auto_columns": null, 1590 | "grid_auto_flow": null, 1591 | "grid_auto_rows": null, 1592 | "grid_column": null, 1593 | "grid_gap": null, 1594 | "grid_row": null, 1595 | "grid_template_areas": null, 1596 | "grid_template_columns": null, 1597 | "grid_template_rows": null, 1598 | "height": null, 1599 | "justify_content": null, 1600 | "justify_items": null, 1601 | "left": null, 1602 | "margin": null, 1603 | "max_height": null, 1604 | "max_width": null, 1605 | 
"min_height": null, 1606 | "min_width": null, 1607 | "object_fit": null, 1608 | "object_position": null, 1609 | "order": null, 1610 | "overflow": null, 1611 | "overflow_x": null, 1612 | "overflow_y": null, 1613 | "padding": null, 1614 | "right": null, 1615 | "top": null, 1616 | "visibility": null, 1617 | "width": null 1618 | } 1619 | }, 1620 | "478122609c1240c49fb6a58d90c0d063": { 1621 | "model_module": "@jupyter-widgets/controls", 1622 | "model_module_version": "1.5.0", 1623 | "model_name": "ProgressStyleModel", 1624 | "state": { 1625 | "_model_module": "@jupyter-widgets/controls", 1626 | "_model_module_version": "1.5.0", 1627 | "_model_name": "ProgressStyleModel", 1628 | "_view_count": null, 1629 | "_view_module": "@jupyter-widgets/base", 1630 | "_view_module_version": "1.2.0", 1631 | "_view_name": "StyleView", 1632 | "bar_color": null, 1633 | "description_width": "" 1634 | } 1635 | }, 1636 | "4b9b1817e2e74816a46fb722cf6fffbe": { 1637 | "model_module": "@jupyter-widgets/base", 1638 | "model_module_version": "1.2.0", 1639 | "model_name": "LayoutModel", 1640 | "state": { 1641 | "_model_module": "@jupyter-widgets/base", 1642 | "_model_module_version": "1.2.0", 1643 | "_model_name": "LayoutModel", 1644 | "_view_count": null, 1645 | "_view_module": "@jupyter-widgets/base", 1646 | "_view_module_version": "1.2.0", 1647 | "_view_name": "LayoutView", 1648 | "align_content": null, 1649 | "align_items": null, 1650 | "align_self": null, 1651 | "border": null, 1652 | "bottom": null, 1653 | "display": null, 1654 | "flex": null, 1655 | "flex_flow": null, 1656 | "grid_area": null, 1657 | "grid_auto_columns": null, 1658 | "grid_auto_flow": null, 1659 | "grid_auto_rows": null, 1660 | "grid_column": null, 1661 | "grid_gap": null, 1662 | "grid_row": null, 1663 | "grid_template_areas": null, 1664 | "grid_template_columns": null, 1665 | "grid_template_rows": null, 1666 | "height": null, 1667 | "justify_content": null, 1668 | "justify_items": null, 1669 | "left": null, 1670 | "margin": null, 1671 | "max_height": null, 1672 | "max_width": null, 1673 | "min_height": null, 1674 | "min_width": null, 1675 | "object_fit": null, 1676 | "object_position": null, 1677 | "order": null, 1678 | "overflow": null, 1679 | "overflow_x": null, 1680 | "overflow_y": null, 1681 | "padding": null, 1682 | "right": null, 1683 | "top": null, 1684 | "visibility": null, 1685 | "width": null 1686 | } 1687 | }, 1688 | "52daf02e77074212b4c33625355b9270": { 1689 | "model_module": "@jupyter-widgets/base", 1690 | "model_module_version": "1.2.0", 1691 | "model_name": "LayoutModel", 1692 | "state": { 1693 | "_model_module": "@jupyter-widgets/base", 1694 | "_model_module_version": "1.2.0", 1695 | "_model_name": "LayoutModel", 1696 | "_view_count": null, 1697 | "_view_module": "@jupyter-widgets/base", 1698 | "_view_module_version": "1.2.0", 1699 | "_view_name": "LayoutView", 1700 | "align_content": null, 1701 | "align_items": null, 1702 | "align_self": null, 1703 | "border": null, 1704 | "bottom": null, 1705 | "display": null, 1706 | "flex": null, 1707 | "flex_flow": null, 1708 | "grid_area": null, 1709 | "grid_auto_columns": null, 1710 | "grid_auto_flow": null, 1711 | "grid_auto_rows": null, 1712 | "grid_column": null, 1713 | "grid_gap": null, 1714 | "grid_row": null, 1715 | "grid_template_areas": null, 1716 | "grid_template_columns": null, 1717 | "grid_template_rows": null, 1718 | "height": null, 1719 | "justify_content": null, 1720 | "justify_items": null, 1721 | "left": null, 1722 | "margin": null, 1723 | "max_height": null, 1724 | 
"max_width": null, 1725 | "min_height": null, 1726 | "min_width": null, 1727 | "object_fit": null, 1728 | "object_position": null, 1729 | "order": null, 1730 | "overflow": null, 1731 | "overflow_x": null, 1732 | "overflow_y": null, 1733 | "padding": null, 1734 | "right": null, 1735 | "top": null, 1736 | "visibility": null, 1737 | "width": null 1738 | } 1739 | }, 1740 | "53ee9b5e340e4f60ae4434a3bf0a89df": { 1741 | "model_module": "@jupyter-widgets/controls", 1742 | "model_module_version": "1.5.0", 1743 | "model_name": "HBoxModel", 1744 | "state": { 1745 | "_dom_classes": [], 1746 | "_model_module": "@jupyter-widgets/controls", 1747 | "_model_module_version": "1.5.0", 1748 | "_model_name": "HBoxModel", 1749 | "_view_count": null, 1750 | "_view_module": "@jupyter-widgets/controls", 1751 | "_view_module_version": "1.5.0", 1752 | "_view_name": "HBoxView", 1753 | "box_style": "", 1754 | "children": [ 1755 | "IPY_MODEL_3f8a645ac0614151a0edce6965a0fc7d", 1756 | "IPY_MODEL_000c44770f834dfe8d4b821ce0c3128c", 1757 | "IPY_MODEL_02f0169ac226471e83090c438fdf8014" 1758 | ], 1759 | "layout": "IPY_MODEL_1be702359a024806bc209f4906b95f65" 1760 | } 1761 | }, 1762 | "54f7081a396743b586bbea676e7dc0ff": { 1763 | "model_module": "@jupyter-widgets/controls", 1764 | "model_module_version": "1.5.0", 1765 | "model_name": "HBoxModel", 1766 | "state": { 1767 | "_dom_classes": [], 1768 | "_model_module": "@jupyter-widgets/controls", 1769 | "_model_module_version": "1.5.0", 1770 | "_model_name": "HBoxModel", 1771 | "_view_count": null, 1772 | "_view_module": "@jupyter-widgets/controls", 1773 | "_view_module_version": "1.5.0", 1774 | "_view_name": "HBoxView", 1775 | "box_style": "", 1776 | "children": [ 1777 | "IPY_MODEL_b2b52be613f049de8becb0409a30eac5", 1778 | "IPY_MODEL_63ffccaedd3343378382637e145707a8", 1779 | "IPY_MODEL_77b82762415047a5bd998114285a837b" 1780 | ], 1781 | "layout": "IPY_MODEL_52daf02e77074212b4c33625355b9270" 1782 | } 1783 | }, 1784 | "572ca3d62de54d3e9381b57fe59621a9": { 1785 | "model_module": "@jupyter-widgets/base", 1786 | "model_module_version": "1.2.0", 1787 | "model_name": "LayoutModel", 1788 | "state": { 1789 | "_model_module": "@jupyter-widgets/base", 1790 | "_model_module_version": "1.2.0", 1791 | "_model_name": "LayoutModel", 1792 | "_view_count": null, 1793 | "_view_module": "@jupyter-widgets/base", 1794 | "_view_module_version": "1.2.0", 1795 | "_view_name": "LayoutView", 1796 | "align_content": null, 1797 | "align_items": null, 1798 | "align_self": null, 1799 | "border": null, 1800 | "bottom": null, 1801 | "display": null, 1802 | "flex": null, 1803 | "flex_flow": null, 1804 | "grid_area": null, 1805 | "grid_auto_columns": null, 1806 | "grid_auto_flow": null, 1807 | "grid_auto_rows": null, 1808 | "grid_column": null, 1809 | "grid_gap": null, 1810 | "grid_row": null, 1811 | "grid_template_areas": null, 1812 | "grid_template_columns": null, 1813 | "grid_template_rows": null, 1814 | "height": null, 1815 | "justify_content": null, 1816 | "justify_items": null, 1817 | "left": null, 1818 | "margin": null, 1819 | "max_height": null, 1820 | "max_width": null, 1821 | "min_height": null, 1822 | "min_width": null, 1823 | "object_fit": null, 1824 | "object_position": null, 1825 | "order": null, 1826 | "overflow": null, 1827 | "overflow_x": null, 1828 | "overflow_y": null, 1829 | "padding": null, 1830 | "right": null, 1831 | "top": null, 1832 | "visibility": null, 1833 | "width": null 1834 | } 1835 | }, 1836 | "5f2568b10f0040278f3474ca26fac15f": { 1837 | "model_module": "@jupyter-widgets/base", 1838 | 
"model_module_version": "1.2.0", 1839 | "model_name": "LayoutModel", 1840 | "state": { 1841 | "_model_module": "@jupyter-widgets/base", 1842 | "_model_module_version": "1.2.0", 1843 | "_model_name": "LayoutModel", 1844 | "_view_count": null, 1845 | "_view_module": "@jupyter-widgets/base", 1846 | "_view_module_version": "1.2.0", 1847 | "_view_name": "LayoutView", 1848 | "align_content": null, 1849 | "align_items": null, 1850 | "align_self": null, 1851 | "border": null, 1852 | "bottom": null, 1853 | "display": null, 1854 | "flex": null, 1855 | "flex_flow": null, 1856 | "grid_area": null, 1857 | "grid_auto_columns": null, 1858 | "grid_auto_flow": null, 1859 | "grid_auto_rows": null, 1860 | "grid_column": null, 1861 | "grid_gap": null, 1862 | "grid_row": null, 1863 | "grid_template_areas": null, 1864 | "grid_template_columns": null, 1865 | "grid_template_rows": null, 1866 | "height": null, 1867 | "justify_content": null, 1868 | "justify_items": null, 1869 | "left": null, 1870 | "margin": null, 1871 | "max_height": null, 1872 | "max_width": null, 1873 | "min_height": null, 1874 | "min_width": null, 1875 | "object_fit": null, 1876 | "object_position": null, 1877 | "order": null, 1878 | "overflow": null, 1879 | "overflow_x": null, 1880 | "overflow_y": null, 1881 | "padding": null, 1882 | "right": null, 1883 | "top": null, 1884 | "visibility": null, 1885 | "width": null 1886 | } 1887 | }, 1888 | "63ffccaedd3343378382637e145707a8": { 1889 | "model_module": "@jupyter-widgets/controls", 1890 | "model_module_version": "1.5.0", 1891 | "model_name": "FloatProgressModel", 1892 | "state": { 1893 | "_dom_classes": [], 1894 | "_model_module": "@jupyter-widgets/controls", 1895 | "_model_module_version": "1.5.0", 1896 | "_model_name": "FloatProgressModel", 1897 | "_view_count": null, 1898 | "_view_module": "@jupyter-widgets/controls", 1899 | "_view_module_version": "1.5.0", 1900 | "_view_name": "ProgressView", 1901 | "bar_style": "success", 1902 | "description": "", 1903 | "description_tooltip": null, 1904 | "layout": "IPY_MODEL_a5352c15d61f4be6a623ba5fa6b5dabe", 1905 | "max": 1, 1906 | "min": 0, 1907 | "orientation": "horizontal", 1908 | "style": "IPY_MODEL_478122609c1240c49fb6a58d90c0d063", 1909 | "value": 1 1910 | } 1911 | }, 1912 | "66a86b9427004c859c782660ac21f07e": { 1913 | "model_module": "@jupyter-widgets/controls", 1914 | "model_module_version": "1.5.0", 1915 | "model_name": "ProgressStyleModel", 1916 | "state": { 1917 | "_model_module": "@jupyter-widgets/controls", 1918 | "_model_module_version": "1.5.0", 1919 | "_model_name": "ProgressStyleModel", 1920 | "_view_count": null, 1921 | "_view_module": "@jupyter-widgets/base", 1922 | "_view_module_version": "1.2.0", 1923 | "_view_name": "StyleView", 1924 | "bar_color": null, 1925 | "description_width": "" 1926 | } 1927 | }, 1928 | "66ed3b2fe5c84b29a90c0e9271004db6": { 1929 | "model_module": "@jupyter-widgets/controls", 1930 | "model_module_version": "1.5.0", 1931 | "model_name": "FloatProgressModel", 1932 | "state": { 1933 | "_dom_classes": [], 1934 | "_model_module": "@jupyter-widgets/controls", 1935 | "_model_module_version": "1.5.0", 1936 | "_model_name": "FloatProgressModel", 1937 | "_view_count": null, 1938 | "_view_module": "@jupyter-widgets/controls", 1939 | "_view_module_version": "1.5.0", 1940 | "_view_name": "ProgressView", 1941 | "bar_style": "success", 1942 | "description": "", 1943 | "description_tooltip": null, 1944 | "layout": "IPY_MODEL_0c6f44b8092841cea7e9ef1fd90ae255", 1945 | "max": 5, 1946 | "min": 0, 1947 | "orientation": 
"horizontal", 1948 | "style": "IPY_MODEL_fb2b97cb722740139c33318d6042698b", 1949 | "value": 5 1950 | } 1951 | }, 1952 | "6a963e7ad8b44bc8968525740b8b9015": { 1953 | "model_module": "@jupyter-widgets/base", 1954 | "model_module_version": "1.2.0", 1955 | "model_name": "LayoutModel", 1956 | "state": { 1957 | "_model_module": "@jupyter-widgets/base", 1958 | "_model_module_version": "1.2.0", 1959 | "_model_name": "LayoutModel", 1960 | "_view_count": null, 1961 | "_view_module": "@jupyter-widgets/base", 1962 | "_view_module_version": "1.2.0", 1963 | "_view_name": "LayoutView", 1964 | "align_content": null, 1965 | "align_items": null, 1966 | "align_self": null, 1967 | "border": null, 1968 | "bottom": null, 1969 | "display": null, 1970 | "flex": null, 1971 | "flex_flow": null, 1972 | "grid_area": null, 1973 | "grid_auto_columns": null, 1974 | "grid_auto_flow": null, 1975 | "grid_auto_rows": null, 1976 | "grid_column": null, 1977 | "grid_gap": null, 1978 | "grid_row": null, 1979 | "grid_template_areas": null, 1980 | "grid_template_columns": null, 1981 | "grid_template_rows": null, 1982 | "height": null, 1983 | "justify_content": null, 1984 | "justify_items": null, 1985 | "left": null, 1986 | "margin": null, 1987 | "max_height": null, 1988 | "max_width": null, 1989 | "min_height": null, 1990 | "min_width": null, 1991 | "object_fit": null, 1992 | "object_position": null, 1993 | "order": null, 1994 | "overflow": null, 1995 | "overflow_x": null, 1996 | "overflow_y": null, 1997 | "padding": null, 1998 | "right": null, 1999 | "top": null, 2000 | "visibility": null, 2001 | "width": null 2002 | } 2003 | }, 2004 | "74579f1f7b464019bdd7292e7a4aa76a": { 2005 | "model_module": "@jupyter-widgets/controls", 2006 | "model_module_version": "1.5.0", 2007 | "model_name": "HTMLModel", 2008 | "state": { 2009 | "_dom_classes": [], 2010 | "_model_module": "@jupyter-widgets/controls", 2011 | "_model_module_version": "1.5.0", 2012 | "_model_name": "HTMLModel", 2013 | "_view_count": null, 2014 | "_view_module": "@jupyter-widgets/controls", 2015 | "_view_module_version": "1.5.0", 2016 | "_view_name": "HTMLView", 2017 | "description": "", 2018 | "description_tooltip": null, 2019 | "layout": "IPY_MODEL_7784d369d0cf4e17bba1dafb503e52b1", 2020 | "placeholder": "​", 2021 | "style": "IPY_MODEL_203aa20d46234f90996d8a060c66f404", 2022 | "value": "100%" 2023 | } 2024 | }, 2025 | "7784d369d0cf4e17bba1dafb503e52b1": { 2026 | "model_module": "@jupyter-widgets/base", 2027 | "model_module_version": "1.2.0", 2028 | "model_name": "LayoutModel", 2029 | "state": { 2030 | "_model_module": "@jupyter-widgets/base", 2031 | "_model_module_version": "1.2.0", 2032 | "_model_name": "LayoutModel", 2033 | "_view_count": null, 2034 | "_view_module": "@jupyter-widgets/base", 2035 | "_view_module_version": "1.2.0", 2036 | "_view_name": "LayoutView", 2037 | "align_content": null, 2038 | "align_items": null, 2039 | "align_self": null, 2040 | "border": null, 2041 | "bottom": null, 2042 | "display": null, 2043 | "flex": null, 2044 | "flex_flow": null, 2045 | "grid_area": null, 2046 | "grid_auto_columns": null, 2047 | "grid_auto_flow": null, 2048 | "grid_auto_rows": null, 2049 | "grid_column": null, 2050 | "grid_gap": null, 2051 | "grid_row": null, 2052 | "grid_template_areas": null, 2053 | "grid_template_columns": null, 2054 | "grid_template_rows": null, 2055 | "height": null, 2056 | "justify_content": null, 2057 | "justify_items": null, 2058 | "left": null, 2059 | "margin": null, 2060 | "max_height": null, 2061 | "max_width": null, 2062 | "min_height": 
null, 2063 | "min_width": null, 2064 | "object_fit": null, 2065 | "object_position": null, 2066 | "order": null, 2067 | "overflow": null, 2068 | "overflow_x": null, 2069 | "overflow_y": null, 2070 | "padding": null, 2071 | "right": null, 2072 | "top": null, 2073 | "visibility": null, 2074 | "width": null 2075 | } 2076 | }, 2077 | "77b82762415047a5bd998114285a837b": { 2078 | "model_module": "@jupyter-widgets/controls", 2079 | "model_module_version": "1.5.0", 2080 | "model_name": "HTMLModel", 2081 | "state": { 2082 | "_dom_classes": [], 2083 | "_model_module": "@jupyter-widgets/controls", 2084 | "_model_module_version": "1.5.0", 2085 | "_model_name": "HTMLModel", 2086 | "_view_count": null, 2087 | "_view_module": "@jupyter-widgets/controls", 2088 | "_view_module_version": "1.5.0", 2089 | "_view_name": "HTMLView", 2090 | "description": "", 2091 | "description_tooltip": null, 2092 | "layout": "IPY_MODEL_6a963e7ad8b44bc8968525740b8b9015", 2093 | "placeholder": "​", 2094 | "style": "IPY_MODEL_875def4e27284a2da7390801da08c32a", 2095 | "value": " 1/1 [00:00<00:00, 1.38it/s]" 2096 | } 2097 | }, 2098 | "82e437676f2f486ab94bdfec15f05b31": { 2099 | "model_module": "@jupyter-widgets/controls", 2100 | "model_module_version": "1.5.0", 2101 | "model_name": "FloatProgressModel", 2102 | "state": { 2103 | "_dom_classes": [], 2104 | "_model_module": "@jupyter-widgets/controls", 2105 | "_model_module_version": "1.5.0", 2106 | "_model_name": "FloatProgressModel", 2107 | "_view_count": null, 2108 | "_view_module": "@jupyter-widgets/controls", 2109 | "_view_module_version": "1.5.0", 2110 | "_view_name": "ProgressView", 2111 | "bar_style": "success", 2112 | "description": "", 2113 | "description_tooltip": null, 2114 | "layout": "IPY_MODEL_eeb7c19df1d64397b0ea7b1a6e2a02a0", 2115 | "max": 5, 2116 | "min": 0, 2117 | "orientation": "horizontal", 2118 | "style": "IPY_MODEL_66a86b9427004c859c782660ac21f07e", 2119 | "value": 5 2120 | } 2121 | }, 2122 | "875def4e27284a2da7390801da08c32a": { 2123 | "model_module": "@jupyter-widgets/controls", 2124 | "model_module_version": "1.5.0", 2125 | "model_name": "DescriptionStyleModel", 2126 | "state": { 2127 | "_model_module": "@jupyter-widgets/controls", 2128 | "_model_module_version": "1.5.0", 2129 | "_model_name": "DescriptionStyleModel", 2130 | "_view_count": null, 2131 | "_view_module": "@jupyter-widgets/base", 2132 | "_view_module_version": "1.2.0", 2133 | "_view_name": "StyleView", 2134 | "description_width": "" 2135 | } 2136 | }, 2137 | "88901984e77d47d08d7a897cec416037": { 2138 | "model_module": "@jupyter-widgets/base", 2139 | "model_module_version": "1.2.0", 2140 | "model_name": "LayoutModel", 2141 | "state": { 2142 | "_model_module": "@jupyter-widgets/base", 2143 | "_model_module_version": "1.2.0", 2144 | "_model_name": "LayoutModel", 2145 | "_view_count": null, 2146 | "_view_module": "@jupyter-widgets/base", 2147 | "_view_module_version": "1.2.0", 2148 | "_view_name": "LayoutView", 2149 | "align_content": null, 2150 | "align_items": null, 2151 | "align_self": null, 2152 | "border": null, 2153 | "bottom": null, 2154 | "display": null, 2155 | "flex": null, 2156 | "flex_flow": null, 2157 | "grid_area": null, 2158 | "grid_auto_columns": null, 2159 | "grid_auto_flow": null, 2160 | "grid_auto_rows": null, 2161 | "grid_column": null, 2162 | "grid_gap": null, 2163 | "grid_row": null, 2164 | "grid_template_areas": null, 2165 | "grid_template_columns": null, 2166 | "grid_template_rows": null, 2167 | "height": null, 2168 | "justify_content": null, 2169 | "justify_items": null, 
2170 | "left": null, 2171 | "margin": null, 2172 | "max_height": null, 2173 | "max_width": null, 2174 | "min_height": null, 2175 | "min_width": null, 2176 | "object_fit": null, 2177 | "object_position": null, 2178 | "order": null, 2179 | "overflow": null, 2180 | "overflow_x": null, 2181 | "overflow_y": null, 2182 | "padding": null, 2183 | "right": null, 2184 | "top": null, 2185 | "visibility": null, 2186 | "width": null 2187 | } 2188 | }, 2189 | "a3f3b4e080b8414cbec4e5cc5db34b6f": { 2190 | "model_module": "@jupyter-widgets/controls", 2191 | "model_module_version": "1.5.0", 2192 | "model_name": "HTMLModel", 2193 | "state": { 2194 | "_dom_classes": [], 2195 | "_model_module": "@jupyter-widgets/controls", 2196 | "_model_module_version": "1.5.0", 2197 | "_model_name": "HTMLModel", 2198 | "_view_count": null, 2199 | "_view_module": "@jupyter-widgets/controls", 2200 | "_view_module_version": "1.5.0", 2201 | "_view_name": "HTMLView", 2202 | "description": "", 2203 | "description_tooltip": null, 2204 | "layout": "IPY_MODEL_bf4d5372f96b4d0aa42f542c41646a02", 2205 | "placeholder": "​", 2206 | "style": "IPY_MODEL_280c00a38f504ce19c649036b54d0cf8", 2207 | "value": " 5/5 [00:00<00:00, 94.40it/s]" 2208 | } 2209 | }, 2210 | "a5352c15d61f4be6a623ba5fa6b5dabe": { 2211 | "model_module": "@jupyter-widgets/base", 2212 | "model_module_version": "1.2.0", 2213 | "model_name": "LayoutModel", 2214 | "state": { 2215 | "_model_module": "@jupyter-widgets/base", 2216 | "_model_module_version": "1.2.0", 2217 | "_model_name": "LayoutModel", 2218 | "_view_count": null, 2219 | "_view_module": "@jupyter-widgets/base", 2220 | "_view_module_version": "1.2.0", 2221 | "_view_name": "LayoutView", 2222 | "align_content": null, 2223 | "align_items": null, 2224 | "align_self": null, 2225 | "border": null, 2226 | "bottom": null, 2227 | "display": null, 2228 | "flex": null, 2229 | "flex_flow": null, 2230 | "grid_area": null, 2231 | "grid_auto_columns": null, 2232 | "grid_auto_flow": null, 2233 | "grid_auto_rows": null, 2234 | "grid_column": null, 2235 | "grid_gap": null, 2236 | "grid_row": null, 2237 | "grid_template_areas": null, 2238 | "grid_template_columns": null, 2239 | "grid_template_rows": null, 2240 | "height": null, 2241 | "justify_content": null, 2242 | "justify_items": null, 2243 | "left": null, 2244 | "margin": null, 2245 | "max_height": null, 2246 | "max_width": null, 2247 | "min_height": null, 2248 | "min_width": null, 2249 | "object_fit": null, 2250 | "object_position": null, 2251 | "order": null, 2252 | "overflow": null, 2253 | "overflow_x": null, 2254 | "overflow_y": null, 2255 | "padding": null, 2256 | "right": null, 2257 | "top": null, 2258 | "visibility": null, 2259 | "width": null 2260 | } 2261 | }, 2262 | "a854925446b1440cbeb3b499ea722944": { 2263 | "model_module": "@jupyter-widgets/base", 2264 | "model_module_version": "1.2.0", 2265 | "model_name": "LayoutModel", 2266 | "state": { 2267 | "_model_module": "@jupyter-widgets/base", 2268 | "_model_module_version": "1.2.0", 2269 | "_model_name": "LayoutModel", 2270 | "_view_count": null, 2271 | "_view_module": "@jupyter-widgets/base", 2272 | "_view_module_version": "1.2.0", 2273 | "_view_name": "LayoutView", 2274 | "align_content": null, 2275 | "align_items": null, 2276 | "align_self": null, 2277 | "border": null, 2278 | "bottom": null, 2279 | "display": null, 2280 | "flex": null, 2281 | "flex_flow": null, 2282 | "grid_area": null, 2283 | "grid_auto_columns": null, 2284 | "grid_auto_flow": null, 2285 | "grid_auto_rows": null, 2286 | "grid_column": null, 2287 | 
"grid_gap": null, 2288 | "grid_row": null, 2289 | "grid_template_areas": null, 2290 | "grid_template_columns": null, 2291 | "grid_template_rows": null, 2292 | "height": null, 2293 | "justify_content": null, 2294 | "justify_items": null, 2295 | "left": null, 2296 | "margin": null, 2297 | "max_height": null, 2298 | "max_width": null, 2299 | "min_height": null, 2300 | "min_width": null, 2301 | "object_fit": null, 2302 | "object_position": null, 2303 | "order": null, 2304 | "overflow": null, 2305 | "overflow_x": null, 2306 | "overflow_y": null, 2307 | "padding": null, 2308 | "right": null, 2309 | "top": null, 2310 | "visibility": null, 2311 | "width": null 2312 | } 2313 | }, 2314 | "b2b52be613f049de8becb0409a30eac5": { 2315 | "model_module": "@jupyter-widgets/controls", 2316 | "model_module_version": "1.5.0", 2317 | "model_name": "HTMLModel", 2318 | "state": { 2319 | "_dom_classes": [], 2320 | "_model_module": "@jupyter-widgets/controls", 2321 | "_model_module_version": "1.5.0", 2322 | "_model_name": "HTMLModel", 2323 | "_view_count": null, 2324 | "_view_module": "@jupyter-widgets/controls", 2325 | "_view_module_version": "1.5.0", 2326 | "_view_name": "HTMLView", 2327 | "description": "", 2328 | "description_tooltip": null, 2329 | "layout": "IPY_MODEL_bd4cce3786324a9c98115008c7699c62", 2330 | "placeholder": "​", 2331 | "style": "IPY_MODEL_fd67eb37520b400db7e662c2a7fd7151", 2332 | "value": "Batches: 100%" 2333 | } 2334 | }, 2335 | "b51cfc1f0f824b0b8c5546d582b41d6b": { 2336 | "model_module": "@jupyter-widgets/base", 2337 | "model_module_version": "1.2.0", 2338 | "model_name": "LayoutModel", 2339 | "state": { 2340 | "_model_module": "@jupyter-widgets/base", 2341 | "_model_module_version": "1.2.0", 2342 | "_model_name": "LayoutModel", 2343 | "_view_count": null, 2344 | "_view_module": "@jupyter-widgets/base", 2345 | "_view_module_version": "1.2.0", 2346 | "_view_name": "LayoutView", 2347 | "align_content": null, 2348 | "align_items": null, 2349 | "align_self": null, 2350 | "border": null, 2351 | "bottom": null, 2352 | "display": null, 2353 | "flex": null, 2354 | "flex_flow": null, 2355 | "grid_area": null, 2356 | "grid_auto_columns": null, 2357 | "grid_auto_flow": null, 2358 | "grid_auto_rows": null, 2359 | "grid_column": null, 2360 | "grid_gap": null, 2361 | "grid_row": null, 2362 | "grid_template_areas": null, 2363 | "grid_template_columns": null, 2364 | "grid_template_rows": null, 2365 | "height": null, 2366 | "justify_content": null, 2367 | "justify_items": null, 2368 | "left": null, 2369 | "margin": null, 2370 | "max_height": null, 2371 | "max_width": null, 2372 | "min_height": null, 2373 | "min_width": null, 2374 | "object_fit": null, 2375 | "object_position": null, 2376 | "order": null, 2377 | "overflow": null, 2378 | "overflow_x": null, 2379 | "overflow_y": null, 2380 | "padding": null, 2381 | "right": null, 2382 | "top": null, 2383 | "visibility": null, 2384 | "width": null 2385 | } 2386 | }, 2387 | "bc3a207ec7644679b0d003bf9b099051": { 2388 | "model_module": "@jupyter-widgets/controls", 2389 | "model_module_version": "1.5.0", 2390 | "model_name": "ProgressStyleModel", 2391 | "state": { 2392 | "_model_module": "@jupyter-widgets/controls", 2393 | "_model_module_version": "1.5.0", 2394 | "_model_name": "ProgressStyleModel", 2395 | "_view_count": null, 2396 | "_view_module": "@jupyter-widgets/base", 2397 | "_view_module_version": "1.2.0", 2398 | "_view_name": "StyleView", 2399 | "bar_color": null, 2400 | "description_width": "" 2401 | } 2402 | }, 2403 | "bd4cce3786324a9c98115008c7699c62": { 
2404 | "model_module": "@jupyter-widgets/base", 2405 | "model_module_version": "1.2.0", 2406 | "model_name": "LayoutModel", 2407 | "state": { 2408 | "_model_module": "@jupyter-widgets/base", 2409 | "_model_module_version": "1.2.0", 2410 | "_model_name": "LayoutModel", 2411 | "_view_count": null, 2412 | "_view_module": "@jupyter-widgets/base", 2413 | "_view_module_version": "1.2.0", 2414 | "_view_name": "LayoutView", 2415 | "align_content": null, 2416 | "align_items": null, 2417 | "align_self": null, 2418 | "border": null, 2419 | "bottom": null, 2420 | "display": null, 2421 | "flex": null, 2422 | "flex_flow": null, 2423 | "grid_area": null, 2424 | "grid_auto_columns": null, 2425 | "grid_auto_flow": null, 2426 | "grid_auto_rows": null, 2427 | "grid_column": null, 2428 | "grid_gap": null, 2429 | "grid_row": null, 2430 | "grid_template_areas": null, 2431 | "grid_template_columns": null, 2432 | "grid_template_rows": null, 2433 | "height": null, 2434 | "justify_content": null, 2435 | "justify_items": null, 2436 | "left": null, 2437 | "margin": null, 2438 | "max_height": null, 2439 | "max_width": null, 2440 | "min_height": null, 2441 | "min_width": null, 2442 | "object_fit": null, 2443 | "object_position": null, 2444 | "order": null, 2445 | "overflow": null, 2446 | "overflow_x": null, 2447 | "overflow_y": null, 2448 | "padding": null, 2449 | "right": null, 2450 | "top": null, 2451 | "visibility": null, 2452 | "width": null 2453 | } 2454 | }, 2455 | "bf4d5372f96b4d0aa42f542c41646a02": { 2456 | "model_module": "@jupyter-widgets/base", 2457 | "model_module_version": "1.2.0", 2458 | "model_name": "LayoutModel", 2459 | "state": { 2460 | "_model_module": "@jupyter-widgets/base", 2461 | "_model_module_version": "1.2.0", 2462 | "_model_name": "LayoutModel", 2463 | "_view_count": null, 2464 | "_view_module": "@jupyter-widgets/base", 2465 | "_view_module_version": "1.2.0", 2466 | "_view_name": "LayoutView", 2467 | "align_content": null, 2468 | "align_items": null, 2469 | "align_self": null, 2470 | "border": null, 2471 | "bottom": null, 2472 | "display": null, 2473 | "flex": null, 2474 | "flex_flow": null, 2475 | "grid_area": null, 2476 | "grid_auto_columns": null, 2477 | "grid_auto_flow": null, 2478 | "grid_auto_rows": null, 2479 | "grid_column": null, 2480 | "grid_gap": null, 2481 | "grid_row": null, 2482 | "grid_template_areas": null, 2483 | "grid_template_columns": null, 2484 | "grid_template_rows": null, 2485 | "height": null, 2486 | "justify_content": null, 2487 | "justify_items": null, 2488 | "left": null, 2489 | "margin": null, 2490 | "max_height": null, 2491 | "max_width": null, 2492 | "min_height": null, 2493 | "min_width": null, 2494 | "object_fit": null, 2495 | "object_position": null, 2496 | "order": null, 2497 | "overflow": null, 2498 | "overflow_x": null, 2499 | "overflow_y": null, 2500 | "padding": null, 2501 | "right": null, 2502 | "top": null, 2503 | "visibility": null, 2504 | "width": null 2505 | } 2506 | }, 2507 | "bf62fd78fbc14ffb9a806202b3df1246": { 2508 | "model_module": "@jupyter-widgets/controls", 2509 | "model_module_version": "1.5.0", 2510 | "model_name": "ProgressStyleModel", 2511 | "state": { 2512 | "_model_module": "@jupyter-widgets/controls", 2513 | "_model_module_version": "1.5.0", 2514 | "_model_name": "ProgressStyleModel", 2515 | "_view_count": null, 2516 | "_view_module": "@jupyter-widgets/base", 2517 | "_view_module_version": "1.2.0", 2518 | "_view_name": "StyleView", 2519 | "bar_color": null, 2520 | "description_width": "" 2521 | } 2522 | }, 2523 | 
"c0e481ee21284807973ac5fde01575ed": { 2524 | "model_module": "@jupyter-widgets/controls", 2525 | "model_module_version": "1.5.0", 2526 | "model_name": "HTMLModel", 2527 | "state": { 2528 | "_dom_classes": [], 2529 | "_model_module": "@jupyter-widgets/controls", 2530 | "_model_module_version": "1.5.0", 2531 | "_model_name": "HTMLModel", 2532 | "_view_count": null, 2533 | "_view_module": "@jupyter-widgets/controls", 2534 | "_view_module_version": "1.5.0", 2535 | "_view_name": "HTMLView", 2536 | "description": "", 2537 | "description_tooltip": null, 2538 | "layout": "IPY_MODEL_ebcf418b325e4fcab4ccd85117b33e65", 2539 | "placeholder": "​", 2540 | "style": "IPY_MODEL_f7feb49173d6451893648b2475b1f05a", 2541 | "value": " 3/3 [00:01<00:00, 2.52it/s]" 2542 | } 2543 | }, 2544 | "c6f8d7f4bf974baaaf0690eb686a7eef": { 2545 | "model_module": "@jupyter-widgets/controls", 2546 | "model_module_version": "1.5.0", 2547 | "model_name": "FloatProgressModel", 2548 | "state": { 2549 | "_dom_classes": [], 2550 | "_model_module": "@jupyter-widgets/controls", 2551 | "_model_module_version": "1.5.0", 2552 | "_model_name": "FloatProgressModel", 2553 | "_view_count": null, 2554 | "_view_module": "@jupyter-widgets/controls", 2555 | "_view_module_version": "1.5.0", 2556 | "_view_name": "ProgressView", 2557 | "bar_style": "success", 2558 | "description": "", 2559 | "description_tooltip": null, 2560 | "layout": "IPY_MODEL_88901984e77d47d08d7a897cec416037", 2561 | "max": 3, 2562 | "min": 0, 2563 | "orientation": "horizontal", 2564 | "style": "IPY_MODEL_02b655f2dd8e453c927f829e5e0377c9", 2565 | "value": 3 2566 | } 2567 | }, 2568 | "cd66653a339348fab93532e6623d319d": { 2569 | "model_module": "@jupyter-widgets/controls", 2570 | "model_module_version": "1.5.0", 2571 | "model_name": "HBoxModel", 2572 | "state": { 2573 | "_dom_classes": [], 2574 | "_model_module": "@jupyter-widgets/controls", 2575 | "_model_module_version": "1.5.0", 2576 | "_model_name": "HBoxModel", 2577 | "_view_count": null, 2578 | "_view_module": "@jupyter-widgets/controls", 2579 | "_view_module_version": "1.5.0", 2580 | "_view_name": "HBoxView", 2581 | "box_style": "", 2582 | "children": [ 2583 | "IPY_MODEL_d165e8cc2d544745a6ac9f85b187a535", 2584 | "IPY_MODEL_82e437676f2f486ab94bdfec15f05b31", 2585 | "IPY_MODEL_a3f3b4e080b8414cbec4e5cc5db34b6f" 2586 | ], 2587 | "layout": "IPY_MODEL_e8fc33d1e1a84386bd793c60407bcdac" 2588 | } 2589 | }, 2590 | "cd6c9bd2707f47cdab8d53618bd3057c": { 2591 | "model_module": "@jupyter-widgets/controls", 2592 | "model_module_version": "1.5.0", 2593 | "model_name": "DescriptionStyleModel", 2594 | "state": { 2595 | "_model_module": "@jupyter-widgets/controls", 2596 | "_model_module_version": "1.5.0", 2597 | "_model_name": "DescriptionStyleModel", 2598 | "_view_count": null, 2599 | "_view_module": "@jupyter-widgets/base", 2600 | "_view_module_version": "1.2.0", 2601 | "_view_name": "StyleView", 2602 | "description_width": "" 2603 | } 2604 | }, 2605 | "d165e8cc2d544745a6ac9f85b187a535": { 2606 | "model_module": "@jupyter-widgets/controls", 2607 | "model_module_version": "1.5.0", 2608 | "model_name": "HTMLModel", 2609 | "state": { 2610 | "_dom_classes": [], 2611 | "_model_module": "@jupyter-widgets/controls", 2612 | "_model_module_version": "1.5.0", 2613 | "_model_name": "HTMLModel", 2614 | "_view_count": null, 2615 | "_view_module": "@jupyter-widgets/controls", 2616 | "_view_module_version": "1.5.0", 2617 | "_view_name": "HTMLView", 2618 | "description": "", 2619 | "description_tooltip": null, 2620 | "layout": 
"IPY_MODEL_5f2568b10f0040278f3474ca26fac15f", 2621 | "placeholder": "​", 2622 | "style": "IPY_MODEL_ec4f63d8c9774c7cb212b5c3a76de52b", 2623 | "value": "100%" 2624 | } 2625 | }, 2626 | "dd133cad2921407c893918ce26310a61": { 2627 | "model_module": "@jupyter-widgets/controls", 2628 | "model_module_version": "1.5.0", 2629 | "model_name": "HTMLModel", 2630 | "state": { 2631 | "_dom_classes": [], 2632 | "_model_module": "@jupyter-widgets/controls", 2633 | "_model_module_version": "1.5.0", 2634 | "_model_name": "HTMLModel", 2635 | "_view_count": null, 2636 | "_view_module": "@jupyter-widgets/controls", 2637 | "_view_module_version": "1.5.0", 2638 | "_view_name": "HTMLView", 2639 | "description": "", 2640 | "description_tooltip": null, 2641 | "layout": "IPY_MODEL_3b94d51e2a134321b9c3a072619f4808", 2642 | "placeholder": "​", 2643 | "style": "IPY_MODEL_cd6c9bd2707f47cdab8d53618bd3057c", 2644 | "value": "Batches: 100%" 2645 | } 2646 | }, 2647 | "dd461a53a88a461f99dd91cda5abbf2f": { 2648 | "model_module": "@jupyter-widgets/controls", 2649 | "model_module_version": "1.5.0", 2650 | "model_name": "HBoxModel", 2651 | "state": { 2652 | "_dom_classes": [], 2653 | "_model_module": "@jupyter-widgets/controls", 2654 | "_model_module_version": "1.5.0", 2655 | "_model_name": "HBoxModel", 2656 | "_view_count": null, 2657 | "_view_module": "@jupyter-widgets/controls", 2658 | "_view_module_version": "1.5.0", 2659 | "_view_name": "HBoxView", 2660 | "box_style": "", 2661 | "children": [ 2662 | "IPY_MODEL_dd133cad2921407c893918ce26310a61", 2663 | "IPY_MODEL_c6f8d7f4bf974baaaf0690eb686a7eef", 2664 | "IPY_MODEL_c0e481ee21284807973ac5fde01575ed" 2665 | ], 2666 | "layout": "IPY_MODEL_e7e56bf261a1474ea351318ea21a7265" 2667 | } 2668 | }, 2669 | "e29b331e164649b8b6e252b71b47434b": { 2670 | "model_module": "@jupyter-widgets/controls", 2671 | "model_module_version": "1.5.0", 2672 | "model_name": "DescriptionStyleModel", 2673 | "state": { 2674 | "_model_module": "@jupyter-widgets/controls", 2675 | "_model_module_version": "1.5.0", 2676 | "_model_name": "DescriptionStyleModel", 2677 | "_view_count": null, 2678 | "_view_module": "@jupyter-widgets/base", 2679 | "_view_module_version": "1.2.0", 2680 | "_view_name": "StyleView", 2681 | "description_width": "" 2682 | } 2683 | }, 2684 | "e73980b6b1db4ef7a191c71fbc5d8831": { 2685 | "model_module": "@jupyter-widgets/base", 2686 | "model_module_version": "1.2.0", 2687 | "model_name": "LayoutModel", 2688 | "state": { 2689 | "_model_module": "@jupyter-widgets/base", 2690 | "_model_module_version": "1.2.0", 2691 | "_model_name": "LayoutModel", 2692 | "_view_count": null, 2693 | "_view_module": "@jupyter-widgets/base", 2694 | "_view_module_version": "1.2.0", 2695 | "_view_name": "LayoutView", 2696 | "align_content": null, 2697 | "align_items": null, 2698 | "align_self": null, 2699 | "border": null, 2700 | "bottom": null, 2701 | "display": null, 2702 | "flex": null, 2703 | "flex_flow": null, 2704 | "grid_area": null, 2705 | "grid_auto_columns": null, 2706 | "grid_auto_flow": null, 2707 | "grid_auto_rows": null, 2708 | "grid_column": null, 2709 | "grid_gap": null, 2710 | "grid_row": null, 2711 | "grid_template_areas": null, 2712 | "grid_template_columns": null, 2713 | "grid_template_rows": null, 2714 | "height": null, 2715 | "justify_content": null, 2716 | "justify_items": null, 2717 | "left": null, 2718 | "margin": null, 2719 | "max_height": null, 2720 | "max_width": null, 2721 | "min_height": null, 2722 | "min_width": null, 2723 | "object_fit": null, 2724 | "object_position": null, 2725 | 
"order": null, 2726 | "overflow": null, 2727 | "overflow_x": null, 2728 | "overflow_y": null, 2729 | "padding": null, 2730 | "right": null, 2731 | "top": null, 2732 | "visibility": null, 2733 | "width": null 2734 | } 2735 | }, 2736 | "e7e56bf261a1474ea351318ea21a7265": { 2737 | "model_module": "@jupyter-widgets/base", 2738 | "model_module_version": "1.2.0", 2739 | "model_name": "LayoutModel", 2740 | "state": { 2741 | "_model_module": "@jupyter-widgets/base", 2742 | "_model_module_version": "1.2.0", 2743 | "_model_name": "LayoutModel", 2744 | "_view_count": null, 2745 | "_view_module": "@jupyter-widgets/base", 2746 | "_view_module_version": "1.2.0", 2747 | "_view_name": "LayoutView", 2748 | "align_content": null, 2749 | "align_items": null, 2750 | "align_self": null, 2751 | "border": null, 2752 | "bottom": null, 2753 | "display": null, 2754 | "flex": null, 2755 | "flex_flow": null, 2756 | "grid_area": null, 2757 | "grid_auto_columns": null, 2758 | "grid_auto_flow": null, 2759 | "grid_auto_rows": null, 2760 | "grid_column": null, 2761 | "grid_gap": null, 2762 | "grid_row": null, 2763 | "grid_template_areas": null, 2764 | "grid_template_columns": null, 2765 | "grid_template_rows": null, 2766 | "height": null, 2767 | "justify_content": null, 2768 | "justify_items": null, 2769 | "left": null, 2770 | "margin": null, 2771 | "max_height": null, 2772 | "max_width": null, 2773 | "min_height": null, 2774 | "min_width": null, 2775 | "object_fit": null, 2776 | "object_position": null, 2777 | "order": null, 2778 | "overflow": null, 2779 | "overflow_x": null, 2780 | "overflow_y": null, 2781 | "padding": null, 2782 | "right": null, 2783 | "top": null, 2784 | "visibility": null, 2785 | "width": null 2786 | } 2787 | }, 2788 | "e8fc33d1e1a84386bd793c60407bcdac": { 2789 | "model_module": "@jupyter-widgets/base", 2790 | "model_module_version": "1.2.0", 2791 | "model_name": "LayoutModel", 2792 | "state": { 2793 | "_model_module": "@jupyter-widgets/base", 2794 | "_model_module_version": "1.2.0", 2795 | "_model_name": "LayoutModel", 2796 | "_view_count": null, 2797 | "_view_module": "@jupyter-widgets/base", 2798 | "_view_module_version": "1.2.0", 2799 | "_view_name": "LayoutView", 2800 | "align_content": null, 2801 | "align_items": null, 2802 | "align_self": null, 2803 | "border": null, 2804 | "bottom": null, 2805 | "display": null, 2806 | "flex": null, 2807 | "flex_flow": null, 2808 | "grid_area": null, 2809 | "grid_auto_columns": null, 2810 | "grid_auto_flow": null, 2811 | "grid_auto_rows": null, 2812 | "grid_column": null, 2813 | "grid_gap": null, 2814 | "grid_row": null, 2815 | "grid_template_areas": null, 2816 | "grid_template_columns": null, 2817 | "grid_template_rows": null, 2818 | "height": null, 2819 | "justify_content": null, 2820 | "justify_items": null, 2821 | "left": null, 2822 | "margin": null, 2823 | "max_height": null, 2824 | "max_width": null, 2825 | "min_height": null, 2826 | "min_width": null, 2827 | "object_fit": null, 2828 | "object_position": null, 2829 | "order": null, 2830 | "overflow": null, 2831 | "overflow_x": null, 2832 | "overflow_y": null, 2833 | "padding": null, 2834 | "right": null, 2835 | "top": null, 2836 | "visibility": null, 2837 | "width": null 2838 | } 2839 | }, 2840 | "ebcf418b325e4fcab4ccd85117b33e65": { 2841 | "model_module": "@jupyter-widgets/base", 2842 | "model_module_version": "1.2.0", 2843 | "model_name": "LayoutModel", 2844 | "state": { 2845 | "_model_module": "@jupyter-widgets/base", 2846 | "_model_module_version": "1.2.0", 2847 | "_model_name": "LayoutModel", 2848 
| "_view_count": null, 2849 | "_view_module": "@jupyter-widgets/base", 2850 | "_view_module_version": "1.2.0", 2851 | "_view_name": "LayoutView", 2852 | "align_content": null, 2853 | "align_items": null, 2854 | "align_self": null, 2855 | "border": null, 2856 | "bottom": null, 2857 | "display": null, 2858 | "flex": null, 2859 | "flex_flow": null, 2860 | "grid_area": null, 2861 | "grid_auto_columns": null, 2862 | "grid_auto_flow": null, 2863 | "grid_auto_rows": null, 2864 | "grid_column": null, 2865 | "grid_gap": null, 2866 | "grid_row": null, 2867 | "grid_template_areas": null, 2868 | "grid_template_columns": null, 2869 | "grid_template_rows": null, 2870 | "height": null, 2871 | "justify_content": null, 2872 | "justify_items": null, 2873 | "left": null, 2874 | "margin": null, 2875 | "max_height": null, 2876 | "max_width": null, 2877 | "min_height": null, 2878 | "min_width": null, 2879 | "object_fit": null, 2880 | "object_position": null, 2881 | "order": null, 2882 | "overflow": null, 2883 | "overflow_x": null, 2884 | "overflow_y": null, 2885 | "padding": null, 2886 | "right": null, 2887 | "top": null, 2888 | "visibility": null, 2889 | "width": null 2890 | } 2891 | }, 2892 | "ec4f63d8c9774c7cb212b5c3a76de52b": { 2893 | "model_module": "@jupyter-widgets/controls", 2894 | "model_module_version": "1.5.0", 2895 | "model_name": "DescriptionStyleModel", 2896 | "state": { 2897 | "_model_module": "@jupyter-widgets/controls", 2898 | "_model_module_version": "1.5.0", 2899 | "_model_name": "DescriptionStyleModel", 2900 | "_view_count": null, 2901 | "_view_module": "@jupyter-widgets/base", 2902 | "_view_module_version": "1.2.0", 2903 | "_view_name": "StyleView", 2904 | "description_width": "" 2905 | } 2906 | }, 2907 | "eeb7c19df1d64397b0ea7b1a6e2a02a0": { 2908 | "model_module": "@jupyter-widgets/base", 2909 | "model_module_version": "1.2.0", 2910 | "model_name": "LayoutModel", 2911 | "state": { 2912 | "_model_module": "@jupyter-widgets/base", 2913 | "_model_module_version": "1.2.0", 2914 | "_model_name": "LayoutModel", 2915 | "_view_count": null, 2916 | "_view_module": "@jupyter-widgets/base", 2917 | "_view_module_version": "1.2.0", 2918 | "_view_name": "LayoutView", 2919 | "align_content": null, 2920 | "align_items": null, 2921 | "align_self": null, 2922 | "border": null, 2923 | "bottom": null, 2924 | "display": null, 2925 | "flex": null, 2926 | "flex_flow": null, 2927 | "grid_area": null, 2928 | "grid_auto_columns": null, 2929 | "grid_auto_flow": null, 2930 | "grid_auto_rows": null, 2931 | "grid_column": null, 2932 | "grid_gap": null, 2933 | "grid_row": null, 2934 | "grid_template_areas": null, 2935 | "grid_template_columns": null, 2936 | "grid_template_rows": null, 2937 | "height": null, 2938 | "justify_content": null, 2939 | "justify_items": null, 2940 | "left": null, 2941 | "margin": null, 2942 | "max_height": null, 2943 | "max_width": null, 2944 | "min_height": null, 2945 | "min_width": null, 2946 | "object_fit": null, 2947 | "object_position": null, 2948 | "order": null, 2949 | "overflow": null, 2950 | "overflow_x": null, 2951 | "overflow_y": null, 2952 | "padding": null, 2953 | "right": null, 2954 | "top": null, 2955 | "visibility": null, 2956 | "width": null 2957 | } 2958 | }, 2959 | "f7feb49173d6451893648b2475b1f05a": { 2960 | "model_module": "@jupyter-widgets/controls", 2961 | "model_module_version": "1.5.0", 2962 | "model_name": "DescriptionStyleModel", 2963 | "state": { 2964 | "_model_module": "@jupyter-widgets/controls", 2965 | "_model_module_version": "1.5.0", 2966 | "_model_name": 
"DescriptionStyleModel", 2967 | "_view_count": null, 2968 | "_view_module": "@jupyter-widgets/base", 2969 | "_view_module_version": "1.2.0", 2970 | "_view_name": "StyleView", 2971 | "description_width": "" 2972 | } 2973 | }, 2974 | "fb2b97cb722740139c33318d6042698b": { 2975 | "model_module": "@jupyter-widgets/controls", 2976 | "model_module_version": "1.5.0", 2977 | "model_name": "ProgressStyleModel", 2978 | "state": { 2979 | "_model_module": "@jupyter-widgets/controls", 2980 | "_model_module_version": "1.5.0", 2981 | "_model_name": "ProgressStyleModel", 2982 | "_view_count": null, 2983 | "_view_module": "@jupyter-widgets/base", 2984 | "_view_module_version": "1.2.0", 2985 | "_view_name": "StyleView", 2986 | "bar_color": null, 2987 | "description_width": "" 2988 | } 2989 | }, 2990 | "fd67eb37520b400db7e662c2a7fd7151": { 2991 | "model_module": "@jupyter-widgets/controls", 2992 | "model_module_version": "1.5.0", 2993 | "model_name": "DescriptionStyleModel", 2994 | "state": { 2995 | "_model_module": "@jupyter-widgets/controls", 2996 | "_model_module_version": "1.5.0", 2997 | "_model_name": "DescriptionStyleModel", 2998 | "_view_count": null, 2999 | "_view_module": "@jupyter-widgets/base", 3000 | "_view_module_version": "1.2.0", 3001 | "_view_name": "StyleView", 3002 | "description_width": "" 3003 | } 3004 | }, 3005 | "fd923f56550241b3a93bde41ffe0dbf1": { 3006 | "model_module": "@jupyter-widgets/controls", 3007 | "model_module_version": "1.5.0", 3008 | "model_name": "DescriptionStyleModel", 3009 | "state": { 3010 | "_model_module": "@jupyter-widgets/controls", 3011 | "_model_module_version": "1.5.0", 3012 | "_model_name": "DescriptionStyleModel", 3013 | "_view_count": null, 3014 | "_view_module": "@jupyter-widgets/base", 3015 | "_view_module_version": "1.2.0", 3016 | "_view_name": "StyleView", 3017 | "description_width": "" 3018 | } 3019 | } 3020 | }, 3021 | "version_major": 2, 3022 | "version_minor": 0 3023 | } 3024 | } 3025 | }, 3026 | "nbformat": 4, 3027 | "nbformat_minor": 5 3028 | } 3029 | --------------------------------------------------------------------------------