├── test.py ├── src └── textSummarizer │ ├── __init__.py │ ├── config │ ├── __init__.py │ └── configuration.py │ ├── utils │ ├── __init__.py │ └── common.py │ ├── conponents │ ├── __init__.py │ ├── data_validation.py │ ├── data_ingestion.py │ ├── data_transformation.py │ ├── model_trainer.py │ └── model_evaluation.py │ ├── pipeline │ ├── __init__.py │ ├── stage_04_model_trainer.py │ ├── stage_02_data_validation.py │ ├── stage_03_data_transformation.py │ ├── stage_05_model_evaluation.py │ ├── stage_01_data_ingestion.py │ └── prediction.py │ ├── constants │ └── __init__.py │ ├── logging │ └── __init__.py │ └── entity │ └── __init__.py ├── params.yaml ├── requirements.txt ├── Dockerfile ├── setup.py ├── app.py ├── LICENSE ├── config └── config.yaml ├── template.py ├── main.py ├── README.md ├── .github └── workflows │ └── main.yaml └── research ├── 02_data_validation.ipynb ├── 03_data_transformation.ipynb ├── trials.ipynb ├── 01_data_ingestion.ipynb ├── 05_Model_evaluation.ipynb └── 04_model_trainer.ipynb /test.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/conponents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/constants/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | CONFIG_FILE_PATH = Path("config/config.yaml") 4 | PARAMS_FILE_PATH = Path("params.yaml") -------------------------------------------------------------------------------- /params.yaml: -------------------------------------------------------------------------------- 1 | TrainingArguments: 2 | num_train_epochs: 1 3 | warmup_steps: 500 4 | per_device_train_batch_size: 1 5 | weight_decay: 0.01 6 | logging_steps: 10 7 | evaluation_strategy: steps 8 | eval_steps: 500 9 | save_steps: 1e6 10 | gradient_accumulation_steps: 16 11 | 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | transformers[sentencepiece] 3 | datasets 4 | sacrebleu 5 | rouge_score 6 | py7zr 7 | pandas 8 | nltk 9 | tqdm 10 | PyYAML 11 | matplotlib 12 | torch 13 | notebook 14 | boto3 15 | mypy-boto3-s3 16 | python-box==6.0.2 17 | ensure==1.0.2 18 | fastapi==0.78.0 19 | uvicorn==0.18.3 20 | Jinja2==3.1.2 21 | -e . 
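The `constants` package above pins the two YAML paths that every stage reads through `read_yaml` (defined later in `src/textSummarizer/utils/common.py`), and the final `-e .` line of requirements.txt installs the local `textSummarizer` package from setup.py in editable mode so those imports resolve against the working tree. A minimal sketch of how these pieces fit together (illustrative only, not a file in the repo, run from the project root):

```python
# Illustrative sketch: reading the pinned YAML paths through the project's own helpers.
from textSummarizer.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from textSummarizer.utils.common import read_yaml

config = read_yaml(CONFIG_FILE_PATH)   # ConfigBox built from config/config.yaml
params = read_yaml(PARAMS_FILE_PATH)   # ConfigBox built from params.yaml

# ConfigBox allows dot access to the keys defined in the YAML files
print(config.artifacts_root)                       # "artifacts"
print(params.TrainingArguments.num_train_epochs)   # 1
```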
22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim-buster 2 | 3 | RUN apt update -y && apt install awscli -y 4 | WORKDIR /app 5 | 6 | COPY . /app 7 | 8 | RUN pip install -r requirements.txt 9 | RUN pip install --upgrade accelerate 10 | RUN pip uninstall -y transformers accelerate 11 | RUN pip install transformers accelerate 12 | 13 | CMD ["python3", "app.py"] 14 | -------------------------------------------------------------------------------- /src/textSummarizer/logging/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | 5 | logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]" 6 | log_dir = "logs" 7 | log_filepath = os.path.join(log_dir,"running_logs.log") 8 | os.makedirs(log_dir, exist_ok=True) 9 | 10 | 11 | 12 | logging.basicConfig( 13 | level= logging.INFO, 14 | format= logging_str, 15 | 16 | handlers=[ 17 | logging.FileHandler(log_filepath), 18 | logging.StreamHandler(sys.stdout) 19 | ] 20 | ) 21 | 22 | logger = logging.getLogger("textSummarizerLogger") -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/stage_04_model_trainer.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.model_trainer import ModelTrainer 3 | from textSummarizer.logging import logger 4 | 5 | 6 | class ModelTrainerTrainingPipeline: 7 | def __init__(self): 8 | pass 9 | 10 | def main(self): 11 | config = ConfigurationManager() 12 | model_trainer_config = config.get_model_trainer_config() 13 | model_trainer_config = ModelTrainer(config=model_trainer_config) 14 | model_trainer_config.train() -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/stage_02_data_validation.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.data_validation import DataValiadtion 3 | from textSummarizer.logging import logger 4 | 5 | 6 | class DataValidationTrainingPipeline: 7 | def __init__(self): 8 | pass 9 | 10 | def main(self): 11 | config = ConfigurationManager() 12 | data_validation_config = config.get_data_validation_config() 13 | data_validation = DataValiadtion(config=data_validation_config) 14 | data_validation.validate_all_files_exist() -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/stage_03_data_transformation.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.data_transformation import DataTransformation 3 | from textSummarizer.logging import logger 4 | 5 | 6 | class DataTransformationTrainingPipeline: 7 | def __init__(self): 8 | pass 9 | 10 | def main(self): 11 | config = ConfigurationManager() 12 | data_transformation_config = config.get_data_transformation_config() 13 | data_transformation = DataTransformation(config=data_transformation_config) 14 | data_transformation.convert() -------------------------------------------------------------------------------- 
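Each pipeline module above is a thin wrapper: its `main()` builds a `ConfigurationManager`, asks it for the stage-specific config, and hands that to the matching component from `conponents/`. `main.py` (shown further down) chains all five stages, but a single stage can also be run on its own; a sketch of that, using the shared logger from `src/textSummarizer/logging/__init__.py`, which writes to both `logs/running_logs.log` and stdout:

```python
# Illustrative only: running one stage standalone, mirroring the pattern used in main.py.
from textSummarizer.logging import logger
from textSummarizer.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline

STAGE_NAME = "Data Ingestion stage"
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    DataIngestionTrainingPipeline().main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<")
except Exception as e:
    logger.exception(e)
    raise e
```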
/src/textSummarizer/pipeline/stage_05_model_evaluation.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.model_evaluation import ModelEvaluation 3 | from textSummarizer.logging import logger 4 | 5 | 6 | 7 | 8 | class ModelEvaluationTrainingPipeline: 9 | def __init__(self): 10 | pass 11 | 12 | def main(self): 13 | config = ConfigurationManager() 14 | model_evaluation_config = config.get_model_evaluation_config() 15 | model_evaluation_config = ModelEvaluation(config=model_evaluation_config) 16 | model_evaluation_config.evaluate() -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/stage_01_data_ingestion.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.data_ingestion import DataIngestion 3 | from textSummarizer.logging import logger 4 | 5 | 6 | class DataIngestionTrainingPipeline: 7 | def __init__(self): 8 | pass 9 | 10 | def main(self): 11 | config = ConfigurationManager() 12 | data_ingestion_config = config.get_data_ingestion_config() 13 | data_ingestion = DataIngestion(config=data_ingestion_config) 14 | data_ingestion.download_file() 15 | data_ingestion.extract_zip_file() 16 | -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/prediction.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from transformers import AutoTokenizer 3 | from transformers import pipeline 4 | 5 | 6 | class PredictionPipeline: 7 | def __init__(self): 8 | self.config = ConfigurationManager().get_model_evaluation_config() 9 | 10 | 11 | 12 | def predict(self,text): 13 | tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path) 14 | gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128} 15 | 16 | pipe = pipeline("summarization", model=self.config.model_path,tokenizer=tokenizer) 17 | 18 | print("Dialogue:") 19 | print(text) 20 | 21 | output = pipe(text, **gen_kwargs)[0]["summary_text"] 22 | print("\nModel Summary:") 23 | print(output) 24 | 25 | return output -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as f: 4 | long_description = f.read() 5 | 6 | 7 | __version__ = "0.0.0" 8 | 9 | REPO_NAME = "Text-Summarizer-Project" 10 | AUTHOR_USER_NAME = "entbappy" 11 | SRC_REPO = "textSummarizer" 12 | AUTHOR_EMAIL = "entbappy73@gmail.com" 13 | 14 | 15 | 16 | setuptools.setup( 17 | name=SRC_REPO, 18 | version=__version__, 19 | author=AUTHOR_USER_NAME, 20 | author_email=AUTHOR_EMAIL, 21 | description="A small python package for NLP app", 22 | long_description=long_description, 23 | long_description_content="text/markdown", 24 | url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}", 25 | project_urls={ 26 | "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues", 27 | }, 28 | package_dir={"": "src"}, 29 | packages=setuptools.find_packages(where="src") 30 | ) -------------------------------------------------------------------------------- /app.py: 
-------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | import uvicorn 3 | import sys 4 | import os 5 | from fastapi.templating import Jinja2Templates 6 | from starlette.responses import RedirectResponse 7 | from fastapi.responses import Response 8 | from textSummarizer.pipeline.prediction import PredictionPipeline 9 | 10 | 11 | text:str = "What is Text Summarization?" 12 | 13 | app = FastAPI() 14 | 15 | @app.get("/", tags=["authentication"]) 16 | async def index(): 17 | return RedirectResponse(url="/docs") 18 | 19 | 20 | 21 | @app.get("/train") 22 | async def training(): 23 | try: 24 | os.system("python main.py") 25 | return Response("Training successful !!") 26 | 27 | except Exception as e: 28 | return Response(f"Error Occurred! {e}") 29 | 30 | 31 | 32 | 33 | @app.post("/predict") 34 | async def predict_route(text): 35 | try: 36 | 37 | obj = PredictionPipeline() 38 | text = obj.predict(text) 39 | return text 40 | except Exception as e: 41 | raise e 42 | 43 | 44 | if __name__=="__main__": 45 | uvicorn.run(app, host="0.0.0.0", port=8080) 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 BAPPY AHMED 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
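For reference, a hedged client sketch for the `app.py` service shown above: it binds to port 8080, `/train` shells out to `main.py`, and `/predict` takes the dialogue as a query parameter named `text` (the route declares an un-annotated `text` argument, which FastAPI treats as a query string). `requests` is not listed in requirements.txt, and the sample dialogue below is invented:

```python
# Hypothetical client for app.py; assumes the `requests` package is installed separately.
import requests

dialogue = (
    "Tom: Are we still on for dinner tonight?\n"
    "Anna: Yes, 7 pm at the usual place.\n"
    "Tom: Great, see you there!"
)

# /train runs `python main.py` (the full five-stage pipeline) on the server
print(requests.get("http://localhost:8080/train").text)

# /predict returns the summary produced by PredictionPipeline
summary = requests.post("http://localhost:8080/predict", params={"text": dialogue})
print(summary.text)
```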
22 | -------------------------------------------------------------------------------- /src/textSummarizer/conponents/data_validation.py: -------------------------------------------------------------------------------- 1 | import os 2 | from textSummarizer.logging import logger 3 | from textSummarizer.entity import DataValidationConfig 4 | 5 | class DataValiadtion: 6 | def __init__(self, config: DataValidationConfig): 7 | self.config = config 8 | 9 | 10 | 11 | def validate_all_files_exist(self)-> bool: 12 | try: 13 | validation_status = None 14 | 15 | all_files = os.listdir(os.path.join("artifacts","data_ingestion","samsum_dataset")) 16 | 17 | for file in all_files: 18 | if file not in self.config.ALL_REQUIRED_FILES: 19 | validation_status = False 20 | with open(self.config.STATUS_FILE, 'w') as f: 21 | f.write(f"Validation status: {validation_status}") 22 | else: 23 | validation_status = True 24 | with open(self.config.STATUS_FILE, 'w') as f: 25 | f.write(f"Validation status: {validation_status}") 26 | 27 | return validation_status 28 | 29 | except Exception as e: 30 | raise e 31 | -------------------------------------------------------------------------------- /src/textSummarizer/entity/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from pathlib import Path 3 | 4 | @dataclass(frozen=True) 5 | class DataIngestionConfig: 6 | root_dir: Path 7 | source_URL: str 8 | local_data_file: Path 9 | unzip_dir: Path 10 | 11 | 12 | 13 | @dataclass(frozen=True) 14 | class DataValidationConfig: 15 | root_dir: Path 16 | STATUS_FILE: str 17 | ALL_REQUIRED_FILES: list 18 | 19 | 20 | 21 | @dataclass(frozen=True) 22 | class DataTransformationConfig: 23 | root_dir: Path 24 | data_path: Path 25 | tokenizer_name: Path 26 | 27 | 28 | 29 | @dataclass(frozen=True) 30 | class ModelTrainerConfig: 31 | root_dir: Path 32 | data_path: Path 33 | model_ckpt: Path 34 | num_train_epochs: int 35 | warmup_steps: int 36 | per_device_train_batch_size: int 37 | weight_decay: float 38 | logging_steps: int 39 | evaluation_strategy: str 40 | eval_steps: int 41 | save_steps: float 42 | gradient_accumulation_steps: int 43 | 44 | 45 | 46 | @dataclass(frozen=True) 47 | class ModelEvaluationConfig: 48 | root_dir: Path 49 | data_path: Path 50 | model_path: Path 51 | tokenizer_path: Path 52 | metric_file_name: Path -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | artifacts_root: artifacts 2 | 3 | 4 | data_ingestion: 5 | root_dir: artifacts/data_ingestion 6 | source_URL: https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip 7 | local_data_file: artifacts/data_ingestion/data.zip 8 | unzip_dir: artifacts/data_ingestion 9 | 10 | 11 | 12 | data_validation: 13 | root_dir: artifacts/data_validation 14 | STATUS_FILE: artifacts/data_validation/status.txt 15 | ALL_REQUIRED_FILES: ["train", "test", "validation"] 16 | 17 | 18 | 19 | data_transformation: 20 | root_dir: artifacts/data_transformation 21 | data_path: artifacts/data_ingestion/samsum_dataset 22 | tokenizer_name: google/pegasus-cnn_dailymail 23 | 24 | 25 | 26 | 27 | model_trainer: 28 | root_dir: artifacts/model_trainer 29 | data_path: artifacts/data_transformation/samsum_dataset 30 | model_ckpt: google/pegasus-cnn_dailymail 31 | 32 | 33 | 34 | 35 | model_evaluation: 36 | root_dir: artifacts/model_evaluation 37 | data_path: 
artifacts/data_transformation/samsum_dataset 38 | model_path: artifacts/model_trainer/pegasus-samsum-model 39 | tokenizer_path: artifacts/model_trainer/tokenizer 40 | metric_file_name: artifacts/model_evaluation/metrics.csv 41 | 42 | -------------------------------------------------------------------------------- /src/textSummarizer/conponents/data_ingestion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request as request 3 | import zipfile 4 | from textSummarizer.logging import logger 5 | from textSummarizer.utils.common import get_size 6 | from pathlib import Path 7 | from textSummarizer.entity import DataIngestionConfig 8 | 9 | 10 | class DataIngestion: 11 | def __init__(self, config: DataIngestionConfig): 12 | self.config = config 13 | 14 | 15 | 16 | def download_file(self): 17 | if not os.path.exists(self.config.local_data_file): 18 | filename, headers = request.urlretrieve( 19 | url = self.config.source_URL, 20 | filename = self.config.local_data_file 21 | ) 22 | logger.info(f"{filename} download! with following info: \n{headers}") 23 | else: 24 | logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}") 25 | 26 | 27 | 28 | def extract_zip_file(self): 29 | """ 30 | zip_file_path: str 31 | Extracts the zip file into the data directory 32 | Function returns None 33 | """ 34 | unzip_path = self.config.unzip_dir 35 | os.makedirs(unzip_path, exist_ok=True) 36 | with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref: 37 | zip_ref.extractall(unzip_path) -------------------------------------------------------------------------------- /src/textSummarizer/conponents/data_transformation.py: -------------------------------------------------------------------------------- 1 | import os 2 | from textSummarizer.logging import logger 3 | from transformers import AutoTokenizer 4 | from datasets import load_dataset, load_from_disk 5 | from textSummarizer.entity import DataTransformationConfig 6 | 7 | 8 | 9 | class DataTransformation: 10 | def __init__(self, config: DataTransformationConfig): 11 | self.config = config 12 | self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name) 13 | 14 | 15 | 16 | def convert_examples_to_features(self,example_batch): 17 | input_encodings = self.tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True ) 18 | 19 | with self.tokenizer.as_target_tokenizer(): 20 | target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True ) 21 | 22 | return { 23 | 'input_ids' : input_encodings['input_ids'], 24 | 'attention_mask': input_encodings['attention_mask'], 25 | 'labels': target_encodings['input_ids'] 26 | } 27 | 28 | 29 | def convert(self): 30 | dataset_samsum = load_from_disk(self.config.data_path) 31 | dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched = True) 32 | dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir,"samsum_dataset")) 33 | 34 | 35 | -------------------------------------------------------------------------------- /template.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import logging 4 | 5 | logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s:') 6 | 7 | 8 | project_name = "textSummarizer" 9 | 10 | list_of_files = [ 11 | ".github/workflows/.gitkeep", 12 | f"src/{project_name}/__init__.py", 13 | 
f"src/{project_name}/conponents/__init__.py", 14 | f"src/{project_name}/utils/__init__.py", 15 | f"src/{project_name}/utils/common.py", 16 | f"src/{project_name}/logging/__init__.py", 17 | f"src/{project_name}/config/__init__.py", 18 | f"src/{project_name}/config/configuration.py", 19 | f"src/{project_name}/pipeline/__init__.py", 20 | f"src/{project_name}/entity/__init__.py", 21 | f"src/{project_name}/constants/__init__.py", 22 | "config/config.yaml", 23 | "params.yaml", 24 | "app.py", 25 | "main.py", 26 | "Dockerfile", 27 | "requirements.txt", 28 | "setup.py", 29 | "research/trials.ipynb", 30 | 31 | ] 32 | 33 | 34 | for filepath in list_of_files: 35 | filepath = Path(filepath) 36 | filedir, filename = os.path.split(filepath) 37 | 38 | if filedir != "": 39 | os.makedirs(filedir, exist_ok=True) 40 | logging.info(f"Creating directory:{filedir} for the file {filename}") 41 | 42 | 43 | if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0): 44 | with open(filepath,'w') as f: 45 | pass 46 | logging.info(f"Creating empty file: {filepath}") 47 | 48 | 49 | 50 | else: 51 | logging.info(f"{filename} is already exists") 52 | 53 | -------------------------------------------------------------------------------- /src/textSummarizer/utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | from box.exceptions import BoxValueError 3 | import yaml 4 | from textSummarizer.logging import logger 5 | from ensure import ensure_annotations 6 | from box import ConfigBox 7 | from pathlib import Path 8 | from typing import Any 9 | 10 | 11 | 12 | @ensure_annotations 13 | def read_yaml(path_to_yaml: Path) -> ConfigBox: 14 | """reads yaml file and returns 15 | 16 | Args: 17 | path_to_yaml (str): path like input 18 | 19 | Raises: 20 | ValueError: if yaml file is empty 21 | e: empty file 22 | 23 | Returns: 24 | ConfigBox: ConfigBox type 25 | """ 26 | try: 27 | with open(path_to_yaml) as yaml_file: 28 | content = yaml.safe_load(yaml_file) 29 | logger.info(f"yaml file: {path_to_yaml} loaded successfully") 30 | return ConfigBox(content) 31 | except BoxValueError: 32 | raise ValueError("yaml file is empty") 33 | except Exception as e: 34 | raise e 35 | 36 | 37 | 38 | @ensure_annotations 39 | def create_directories(path_to_directories: list, verbose=True): 40 | """create list of directories 41 | 42 | Args: 43 | path_to_directories (list): list of path of directories 44 | ignore_log (bool, optional): ignore if multiple dirs is to be created. Defaults to False. 
45 | """ 46 | for path in path_to_directories: 47 | os.makedirs(path, exist_ok=True) 48 | if verbose: 49 | logger.info(f"created directory at: {path}") 50 | 51 | 52 | 53 | @ensure_annotations 54 | def get_size(path: Path) -> str: 55 | """get size in KB 56 | 57 | Args: 58 | path (Path): path of the file 59 | 60 | Returns: 61 | str: size in KB 62 | """ 63 | size_in_kb = round(os.path.getsize(path)/1024) 64 | return f"~ {size_in_kb} KB" 65 | 66 | 67 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline 2 | from textSummarizer.pipeline.stage_02_data_validation import DataValidationTrainingPipeline 3 | from textSummarizer.pipeline.stage_03_data_transformation import DataTransformationTrainingPipeline 4 | from textSummarizer.pipeline.stage_04_model_trainer import ModelTrainerTrainingPipeline 5 | from textSummarizer.pipeline.stage_05_model_evaluation import ModelEvaluationTrainingPipeline 6 | from textSummarizer.logging import logger 7 | 8 | 9 | STAGE_NAME = "Data Ingestion stage" 10 | try: 11 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 12 | data_ingestion = DataIngestionTrainingPipeline() 13 | data_ingestion.main() 14 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 15 | except Exception as e: 16 | logger.exception(e) 17 | raise e 18 | 19 | 20 | 21 | 22 | STAGE_NAME = "Data Validation stage" 23 | try: 24 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 25 | data_validation = DataValidationTrainingPipeline() 26 | data_validation.main() 27 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 28 | except Exception as e: 29 | logger.exception(e) 30 | raise e 31 | 32 | 33 | 34 | STAGE_NAME = "Data Transformation stage" 35 | try: 36 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 37 | data_transformation = DataTransformationTrainingPipeline() 38 | data_transformation.main() 39 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 40 | except Exception as e: 41 | logger.exception(e) 42 | raise e 43 | 44 | 45 | 46 | STAGE_NAME = "Model Trainer stage" 47 | try: 48 | logger.info(f"*******************") 49 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 50 | model_trainer = ModelTrainerTrainingPipeline() 51 | model_trainer.main() 52 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 53 | except Exception as e: 54 | logger.exception(e) 55 | raise e 56 | 57 | 58 | 59 | 60 | STAGE_NAME = "Model Evaluation stage" 61 | try: 62 | logger.info(f"*******************") 63 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 64 | model_evaluation = ModelEvaluationTrainingPipeline() 65 | model_evaluation.main() 66 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 67 | except Exception as e: 68 | logger.exception(e) 69 | raise e 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /src/textSummarizer/conponents/model_trainer.py: -------------------------------------------------------------------------------- 1 | from transformers import TrainingArguments, Trainer 2 | from transformers import DataCollatorForSeq2Seq 3 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 4 | from datasets import load_dataset, load_from_disk 5 | from textSummarizer.entity import 
ModelTrainerConfig 6 | import torch 7 | import os 8 | 9 | 10 | class ModelTrainer: 11 | def __init__(self, config: ModelTrainerConfig): 12 | self.config = config 13 | 14 | 15 | 16 | def train(self): 17 | device = "cuda" if torch.cuda.is_available() else "cpu" 18 | tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt) 19 | model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device) 20 | seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus) 21 | 22 | #loading data 23 | dataset_samsum_pt = load_from_disk(self.config.data_path) 24 | 25 | # trainer_args = TrainingArguments( 26 | # output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps, 27 | # per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size, 28 | # weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps, 29 | # evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6, 30 | # gradient_accumulation_steps=self.config.gradient_accumulation_steps 31 | # ) 32 | 33 | 34 | trainer_args = TrainingArguments( 35 | output_dir=self.config.root_dir, num_train_epochs=1, warmup_steps=500, 36 | per_device_train_batch_size=1, per_device_eval_batch_size=1, 37 | weight_decay=0.01, logging_steps=10, 38 | evaluation_strategy='steps', eval_steps=500, save_steps=1e6, 39 | gradient_accumulation_steps=16 40 | ) 41 | 42 | trainer = Trainer(model=model_pegasus, args=trainer_args, 43 | tokenizer=tokenizer, data_collator=seq2seq_data_collator, 44 | train_dataset=dataset_samsum_pt["train"], 45 | eval_dataset=dataset_samsum_pt["validation"]) 46 | 47 | trainer.train() 48 | 49 | ## Save model 50 | model_pegasus.save_pretrained(os.path.join(self.config.root_dir,"pegasus-samsum-model")) 51 | ## Save tokenizer 52 | tokenizer.save_pretrained(os.path.join(self.config.root_dir,"tokenizer")) 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # End to end Text-Summarizer-Project 2 | 3 | ## Workflows 4 | 5 | 1. Update config.yaml 6 | 2. Update params.yaml 7 | 3. Update entity 8 | 4. Update the configuration manager in src config 9 | 5. update the conponents 10 | 6. update the pipeline 11 | 7. update the main.py 12 | 8. update the app.py 13 | 14 | 15 | # How to run? 16 | ### STEPS: 17 | 18 | Clone the repository 19 | 20 | ```bash 21 | https://github.com/entbappy/End-to-end-Text-Summarization 22 | ``` 23 | ### STEP 01- Create a conda environment after opening the repository 24 | 25 | ```bash 26 | conda create -n summary python=3.8 -y 27 | ``` 28 | 29 | ```bash 30 | conda activate summary 31 | ``` 32 | 33 | 34 | ### STEP 02- install the requirements 35 | ```bash 36 | pip install -r requirements.txt 37 | ``` 38 | 39 | 40 | ```bash 41 | # Finally run the following command 42 | python app.py 43 | ``` 44 | 45 | Now, 46 | ```bash 47 | open up you local host and port 48 | ``` 49 | 50 | 51 | ```bash 52 | Author: Krish Naik 53 | Data Scientist 54 | Email: krishnaik06@gmail.com 55 | 56 | ``` 57 | 58 | 59 | 60 | # AWS-CICD-Deployment-with-Github-Actions 61 | 62 | ## 1. Login to AWS console. 63 | 64 | ## 2. Create IAM user for deployment 65 | 66 | #with specific access 67 | 68 | 1. EC2 access : It is virtual machine 69 | 70 | 2. 
ECR: Elastic Container registry to save your docker image in aws 71 | 72 | 73 | #Description: About the deployment 74 | 75 | 1. Build docker image of the source code 76 | 77 | 2. Push your docker image to ECR 78 | 79 | 3. Launch Your EC2 80 | 81 | 4. Pull Your image from ECR in EC2 82 | 83 | 5. Lauch your docker image in EC2 84 | 85 | #Policy: 86 | 87 | 1. AmazonEC2ContainerRegistryFullAccess 88 | 89 | 2. AmazonEC2FullAccess 90 | 91 | 92 | ## 3. Create ECR repo to store/save docker image 93 | - Save the URI: 566373416292.dkr.ecr.us-east-1.amazonaws.com/text-s 94 | 95 | 96 | ## 4. Create EC2 machine (Ubuntu) 97 | 98 | ## 5. Open EC2 and Install docker in EC2 Machine: 99 | 100 | 101 | #optinal 102 | 103 | sudo apt-get update -y 104 | 105 | sudo apt-get upgrade 106 | 107 | #required 108 | 109 | curl -fsSL https://get.docker.com -o get-docker.sh 110 | 111 | sudo sh get-docker.sh 112 | 113 | sudo usermod -aG docker ubuntu 114 | 115 | newgrp docker 116 | 117 | # 6. Configure EC2 as self-hosted runner: 118 | setting>actions>runner>new self hosted runner> choose os> then run command one by one 119 | 120 | 121 | # 7. Setup github secrets: 122 | 123 | AWS_ACCESS_KEY_ID= 124 | 125 | AWS_SECRET_ACCESS_KEY= 126 | 127 | AWS_REGION = us-east-1 128 | 129 | AWS_ECR_LOGIN_URI = demo>> 566373416292.dkr.ecr.ap-south-1.amazonaws.com 130 | 131 | ECR_REPOSITORY_NAME = simple-app 132 | -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: workflow 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths-ignore: 8 | - 'README.md' 9 | 10 | permissions: 11 | id-token: write 12 | contents: read 13 | 14 | jobs: 15 | integration: 16 | name: Continuous Integration 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout Code 20 | uses: actions/checkout@v3 21 | 22 | - name: Lint code 23 | run: echo "Linting repository" 24 | 25 | - name: Run unit tests 26 | run: echo "Running unit tests" 27 | 28 | build-and-push-ecr-image: 29 | name: Continuous Delivery 30 | needs: integration 31 | runs-on: ubuntu-latest 32 | steps: 33 | - name: Checkout Code 34 | uses: actions/checkout@v3 35 | 36 | - name: Install Utilities 37 | run: | 38 | sudo apt-get update 39 | sudo apt-get install -y jq unzip 40 | - name: Configure AWS credentials 41 | uses: aws-actions/configure-aws-credentials@v1 42 | with: 43 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 44 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 45 | aws-region: ${{ secrets.AWS_REGION }} 46 | 47 | - name: Login to Amazon ECR 48 | id: login-ecr 49 | uses: aws-actions/amazon-ecr-login@v1 50 | 51 | - name: Build, tag, and push image to Amazon ECR 52 | id: build-image 53 | env: 54 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 55 | ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY_NAME }} 56 | IMAGE_TAG: latest 57 | run: | 58 | # Build a docker container and 59 | # push it to ECR so that it can 60 | # be deployed to ECS. 61 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 
62 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 63 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 64 | 65 | 66 | Continuous-Deployment: 67 | needs: build-and-push-ecr-image 68 | runs-on: self-hosted 69 | steps: 70 | - name: Checkout 71 | uses: actions/checkout@v3 72 | 73 | - name: Configure AWS credentials 74 | uses: aws-actions/configure-aws-credentials@v1 75 | with: 76 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 77 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 78 | aws-region: ${{ secrets.AWS_REGION }} 79 | 80 | - name: Login to Amazon ECR 81 | id: login-ecr 82 | uses: aws-actions/amazon-ecr-login@v1 83 | 84 | 85 | - name: Pull latest images 86 | run: | 87 | docker pull ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest 88 | 89 | # - name: Stop and remove container if running 90 | # run: | 91 | # docker ps -q --filter "name=texts" | grep -q . && docker stop texts && docker rm -fv texts 92 | 93 | - name: Run Docker Image to serve users 94 | run: | 95 | docker run -d -p 8080:8080 --name=texts -e 'AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}' -e 'AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}' -e 'AWS_REGION=${{ secrets.AWS_REGION }}' ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest 96 | - name: Clean previous images and containers 97 | run: | 98 | docker system prune -f -------------------------------------------------------------------------------- /src/textSummarizer/conponents/model_evaluation.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 2 | from datasets import load_dataset, load_from_disk, load_metric 3 | import torch 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from textSummarizer.entity import ModelEvaluationConfig 7 | 8 | 9 | 10 | 11 | class ModelEvaluation: 12 | def __init__(self, config: ModelEvaluationConfig): 13 | self.config = config 14 | 15 | 16 | 17 | def generate_batch_sized_chunks(self,list_of_elements, batch_size): 18 | """split the dataset into smaller batches that we can process simultaneously 19 | Yield successive batch-sized chunks from list_of_elements.""" 20 | for i in range(0, len(list_of_elements), batch_size): 21 | yield list_of_elements[i : i + batch_size] 22 | 23 | 24 | def calculate_metric_on_test_ds(self,dataset, metric, model, tokenizer, 25 | batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu", 26 | column_text="article", 27 | column_summary="highlights"): 28 | article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size)) 29 | target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size)) 30 | 31 | for article_batch, target_batch in tqdm( 32 | zip(article_batches, target_batches), total=len(article_batches)): 33 | 34 | inputs = tokenizer(article_batch, max_length=1024, truncation=True, 35 | padding="max_length", return_tensors="pt") 36 | 37 | summaries = model.generate(input_ids=inputs["input_ids"].to(device), 38 | attention_mask=inputs["attention_mask"].to(device), 39 | length_penalty=0.8, num_beams=8, max_length=128) 40 | ''' parameter for length penalty ensures that the model does not generate sequences that are too long. ''' 41 | 42 | # Finally, we decode the generated texts, 43 | # replace the token, and add the decoded texts with the references to the metric. 
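# (skip_special_tokens drops pad/eos markers during decoding, and clean_up_tokenization_spaces tidies spacing around punctuation in the decoded text.)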
44 | decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 45 | clean_up_tokenization_spaces=True) 46 | for s in summaries] 47 | 48 | decoded_summaries = [d.replace("", " ") for d in decoded_summaries] 49 | 50 | 51 | metric.add_batch(predictions=decoded_summaries, references=target_batch) 52 | 53 | # Finally compute and return the ROUGE scores. 54 | score = metric.compute() 55 | return score 56 | 57 | 58 | def evaluate(self): 59 | device = "cuda" if torch.cuda.is_available() else "cpu" 60 | tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path) 61 | model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device) 62 | 63 | #loading data 64 | dataset_samsum_pt = load_from_disk(self.config.data_path) 65 | 66 | 67 | rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"] 68 | 69 | rouge_metric = load_metric('rouge') 70 | 71 | score = self.calculate_metric_on_test_ds( 72 | dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary' 73 | ) 74 | 75 | rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names ) 76 | 77 | df = pd.DataFrame(rouge_dict, index = ['pegasus'] ) 78 | df.to_csv(self.config.metric_file_name, index=False) 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /src/textSummarizer/config/configuration.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.constants import * 2 | from textSummarizer.utils.common import read_yaml, create_directories 3 | from textSummarizer.entity import (DataIngestionConfig, 4 | DataValidationConfig, 5 | DataTransformationConfig, 6 | ModelTrainerConfig, 7 | ModelEvaluationConfig) 8 | 9 | 10 | class ConfigurationManager: 11 | def __init__( 12 | self, 13 | config_filepath = CONFIG_FILE_PATH, 14 | params_filepath = PARAMS_FILE_PATH): 15 | 16 | self.config = read_yaml(config_filepath) 17 | self.params = read_yaml(params_filepath) 18 | 19 | create_directories([self.config.artifacts_root]) 20 | 21 | 22 | 23 | def get_data_ingestion_config(self) -> DataIngestionConfig: 24 | config = self.config.data_ingestion 25 | 26 | create_directories([config.root_dir]) 27 | 28 | data_ingestion_config = DataIngestionConfig( 29 | root_dir=config.root_dir, 30 | source_URL=config.source_URL, 31 | local_data_file=config.local_data_file, 32 | unzip_dir=config.unzip_dir 33 | ) 34 | 35 | return data_ingestion_config 36 | 37 | 38 | 39 | def get_data_validation_config(self) -> DataValidationConfig: 40 | config = self.config.data_validation 41 | 42 | create_directories([config.root_dir]) 43 | 44 | data_validation_config = DataValidationConfig( 45 | root_dir=config.root_dir, 46 | STATUS_FILE=config.STATUS_FILE, 47 | ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES, 48 | ) 49 | 50 | return data_validation_config 51 | 52 | 53 | def get_data_transformation_config(self) -> DataTransformationConfig: 54 | config = self.config.data_transformation 55 | 56 | create_directories([config.root_dir]) 57 | 58 | data_transformation_config = DataTransformationConfig( 59 | root_dir=config.root_dir, 60 | data_path=config.data_path, 61 | tokenizer_name = config.tokenizer_name 62 | ) 63 | 64 | return data_transformation_config 65 | 66 | 67 | 68 | def get_model_trainer_config(self) -> ModelTrainerConfig: 69 | config = self.config.model_trainer 70 | params = self.params.TrainingArguments 71 | 72 | create_directories([config.root_dir]) 73 | 74 | 
model_trainer_config = ModelTrainerConfig( 75 | root_dir=config.root_dir, 76 | data_path=config.data_path, 77 | model_ckpt = config.model_ckpt, 78 | num_train_epochs = params.num_train_epochs, 79 | warmup_steps = params.warmup_steps, 80 | per_device_train_batch_size = params.per_device_train_batch_size, 81 | weight_decay = params.weight_decay, 82 | logging_steps = params.logging_steps, 83 | evaluation_strategy = params.evaluation_strategy, 84 | eval_steps = params.evaluation_strategy, 85 | save_steps = params.save_steps, 86 | gradient_accumulation_steps = params.gradient_accumulation_steps 87 | ) 88 | 89 | return model_trainer_config 90 | 91 | 92 | def get_model_evaluation_config(self) -> ModelEvaluationConfig: 93 | config = self.config.model_evaluation 94 | 95 | create_directories([config.root_dir]) 96 | 97 | model_evaluation_config = ModelEvaluationConfig( 98 | root_dir=config.root_dir, 99 | data_path=config.data_path, 100 | model_path = config.model_path, 101 | tokenizer_path = config.tokenizer_path, 102 | metric_file_name = config.metric_file_name 103 | 104 | ) 105 | 106 | return model_evaluation_config 107 | -------------------------------------------------------------------------------- /research/02_data_validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "\n", 71 | "@dataclass(frozen=True)\n", 72 | "class DataValidationConfig:\n", 73 | " root_dir: Path\n", 74 | " STATUS_FILE: str\n", 75 | " ALL_REQUIRED_FILES: list" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from textSummarizer.constants import *\n", 85 | "from textSummarizer.utils.common import read_yaml, create_directories" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "class ConfigurationManager:\n", 95 | " def __init__(\n", 96 | " self,\n", 97 | " config_filepath = CONFIG_FILE_PATH,\n", 98 | " params_filepath = PARAMS_FILE_PATH):\n", 99 | "\n", 100 | " self.config = read_yaml(config_filepath)\n", 101 | " self.params = read_yaml(params_filepath)\n", 102 | "\n", 103 | " create_directories([self.config.artifacts_root])\n", 104 | "\n", 105 | "\n", 106 | " \n", 
107 | " def get_data_validation_config(self) -> DataValidationConfig:\n", 108 | " config = self.config.data_validation\n", 109 | "\n", 110 | " create_directories([config.root_dir])\n", 111 | "\n", 112 | " data_validation_config = DataValidationConfig(\n", 113 | " root_dir=config.root_dir,\n", 114 | " STATUS_FILE=config.STATUS_FILE,\n", 115 | " ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,\n", 116 | " )\n", 117 | "\n", 118 | " return data_validation_config" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 8, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "import os\n", 128 | "from textSummarizer.logging import logger" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 9, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "class DataValiadtion:\n", 138 | " def __init__(self, config: DataValidationConfig):\n", 139 | " self.config = config\n", 140 | "\n", 141 | "\n", 142 | " \n", 143 | " def validate_all_files_exist(self)-> bool:\n", 144 | " try:\n", 145 | " validation_status = None\n", 146 | "\n", 147 | " all_files = os.listdir(os.path.join(\"artifacts\",\"data_ingestion\",\"samsum_dataset\"))\n", 148 | "\n", 149 | " for file in all_files:\n", 150 | " if file not in self.config.ALL_REQUIRED_FILES:\n", 151 | " validation_status = False\n", 152 | " with open(self.config.STATUS_FILE, 'w') as f:\n", 153 | " f.write(f\"Validation status: {validation_status}\")\n", 154 | " else:\n", 155 | " validation_status = True\n", 156 | " with open(self.config.STATUS_FILE, 'w') as f:\n", 157 | " f.write(f\"Validation status: {validation_status}\")\n", 158 | "\n", 159 | " return validation_status\n", 160 | " \n", 161 | " except Exception as e:\n", 162 | " raise e\n" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 10, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "[2023-05-17 11:58:23,823: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 175 | "[2023-05-17 11:58:23,826: INFO: common: yaml file: params.yaml loaded successfully]\n", 176 | "[2023-05-17 11:58:23,828: INFO: common: created directory at: artifacts]\n", 177 | "[2023-05-17 11:58:23,829: INFO: common: created directory at: artifacts/data_validation]\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "try:\n", 183 | " config = ConfigurationManager()\n", 184 | " data_validation_config = config.get_data_validation_config()\n", 185 | " data_validation = DataValiadtion(config=data_validation_config)\n", 186 | " data_validation.validate_all_files_exist()\n", 187 | "except Exception as e:\n", 188 | " raise e" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "textS", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.8.16" 216 | }, 217 | "orig_nbformat": 4 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /research/03_data_transformation.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "\n", 71 | "@dataclass(frozen=True)\n", 72 | "class DataTransformationConfig:\n", 73 | " root_dir: Path\n", 74 | " data_path: Path\n", 75 | " tokenizer_name: Path\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from textSummarizer.constants import *\n", 85 | "from textSummarizer.utils.common import read_yaml, create_directories" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "class ConfigurationManager:\n", 95 | " def __init__(\n", 96 | " self,\n", 97 | " config_filepath = CONFIG_FILE_PATH,\n", 98 | " params_filepath = PARAMS_FILE_PATH):\n", 99 | "\n", 100 | " self.config = read_yaml(config_filepath)\n", 101 | " self.params = read_yaml(params_filepath)\n", 102 | "\n", 103 | " create_directories([self.config.artifacts_root])\n", 104 | "\n", 105 | "\n", 106 | " \n", 107 | " def get_data_transformation_config(self) -> DataTransformationConfig:\n", 108 | " config = self.config.data_transformation\n", 109 | "\n", 110 | " create_directories([config.root_dir])\n", 111 | "\n", 112 | " data_transformation_config = DataTransformationConfig(\n", 113 | " root_dir=config.root_dir,\n", 114 | " data_path=config.data_path,\n", 115 | " tokenizer_name = config.tokenizer_name\n", 116 | " )\n", 117 | "\n", 118 | " return data_transformation_config\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 9, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "import os\n", 128 | "from textSummarizer.logging import logger\n", 129 | "from transformers import AutoTokenizer\n", 130 | "from datasets import load_dataset, load_from_disk" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 10, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "class DataTransformation:\n", 140 | " def __init__(self, config: DataTransformationConfig):\n", 141 | " self.config = config\n", 142 | " self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)\n", 143 | "\n", 144 | "\n", 145 | " \n", 146 | " def 
convert_examples_to_features(self,example_batch):\n", 147 | " input_encodings = self.tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )\n", 148 | " \n", 149 | " with self.tokenizer.as_target_tokenizer():\n", 150 | " target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )\n", 151 | " \n", 152 | " return {\n", 153 | " 'input_ids' : input_encodings['input_ids'],\n", 154 | " 'attention_mask': input_encodings['attention_mask'],\n", 155 | " 'labels': target_encodings['input_ids']\n", 156 | " }\n", 157 | " \n", 158 | "\n", 159 | " def convert(self):\n", 160 | " dataset_samsum = load_from_disk(self.config.data_path)\n", 161 | " dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched = True)\n", 162 | " dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir,\"samsum_dataset\"))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 11, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "[2023-05-18 08:51:29,881: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 175 | "[2023-05-18 08:51:29,892: INFO: common: yaml file: params.yaml loaded successfully]\n", 176 | "[2023-05-18 08:51:29,893: INFO: common: created directory at: artifacts]\n", 177 | "[2023-05-18 08:51:29,894: INFO: common: created directory at: artifacts/data_transformation]\n" 178 | ] 179 | }, 180 | { 181 | "name": "stderr", 182 | "output_type": "stream", 183 | "text": [ 184 | "Map: 0%| | 0/14732 [00:00 1\u001b[0m d\u001b[39m.\u001b[39;49mkey\n", 45 | "\u001b[1;31mAttributeError\u001b[0m: 'dict' object has no attribute 'key'" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "d.key" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from box import ConfigBox" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "d2 = ConfigBox({\"key\": \"value\", \"key1\": \"value1\"})" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 6, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "ConfigBox({'key': 'value', 'key1': 'value1'})" 80 | ] 81 | }, 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "d2" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "'value'" 100 | ] 101 | }, 102 | "execution_count": 7, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "d2.key" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "def get_product(x: int, y: int) -> int:\n", 118 | " return x * y" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 9, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "8" 130 | ] 131 | }, 132 | "execution_count": 9, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "get_product(x = 2, y = 4)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 10, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 
149 | "'44'" 150 | ] 151 | }, 152 | "execution_count": 10, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "get_product(x = 2, y = \"4\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 11, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "from ensure import ensure_annotations" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 12, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "@ensure_annotations\n", 177 | "def get_product(x: int, y: int) -> int:\n", 178 | " return x * y" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 13, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "8" 190 | ] 191 | }, 192 | "execution_count": 13, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "get_product(x = 2, y = 4)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 14, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "ename": "EnsureError", 208 | "evalue": "Argument y of type to does not match annotation type ", 209 | "output_type": "error", 210 | "traceback": [ 211 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 212 | "\u001b[1;31mEnsureError\u001b[0m Traceback (most recent call last)", 213 | "Cell \u001b[1;32mIn[14], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m get_product(x \u001b[39m=\u001b[39;49m \u001b[39m2\u001b[39;49m, y \u001b[39m=\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39m4\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", 214 | "File \u001b[1;32md:\\Softwares\\anaconda3\\envs\\textS\\lib\\site-packages\\ensure\\main.py:845\u001b[0m, in \u001b[0;36mWrappedFunctionReturn.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 840\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(value, templ):\n\u001b[0;32m 841\u001b[0m msg \u001b[39m=\u001b[39m (\n\u001b[0;32m 842\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mArgument \u001b[39m\u001b[39m{arg}\u001b[39;00m\u001b[39m of type \u001b[39m\u001b[39m{valt}\u001b[39;00m\u001b[39m to \u001b[39m\u001b[39m{f}\u001b[39;00m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 843\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mdoes not match annotation type \u001b[39m\u001b[39m{t}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 844\u001b[0m )\n\u001b[1;32m--> 845\u001b[0m \u001b[39mraise\u001b[39;00m EnsureError(msg\u001b[39m.\u001b[39mformat(\n\u001b[0;32m 846\u001b[0m arg\u001b[39m=\u001b[39marg, f\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mf, t\u001b[39m=\u001b[39mtempl, valt\u001b[39m=\u001b[39m\u001b[39mtype\u001b[39m(value)\n\u001b[0;32m 847\u001b[0m ))\n\u001b[0;32m 849\u001b[0m return_val \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mf(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 850\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(return_val, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturn_templ):\n", 215 | "\u001b[1;31mEnsureError\u001b[0m: Argument y of type to does not match annotation type " 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "get_product(x = 2, y = \"4\")" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [] 
229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "textS", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.8.16" 248 | }, 249 | "orig_nbformat": 4 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 2 253 | } 254 | -------------------------------------------------------------------------------- /research/01_data_ingestion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "@dataclass(frozen=True)\n", 71 | "class DataIngestionConfig:\n", 72 | " root_dir: Path\n", 73 | " source_URL: str\n", 74 | " local_data_file: Path\n", 75 | " unzip_dir: Path" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from textSummarizer.constants import *\n", 85 | "from textSummarizer.utils.common import read_yaml, create_directories" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "class ConfigurationManager:\n", 95 | " def __init__(\n", 96 | " self,\n", 97 | " config_filepath = CONFIG_FILE_PATH,\n", 98 | " params_filepath = PARAMS_FILE_PATH):\n", 99 | "\n", 100 | " self.config = read_yaml(config_filepath)\n", 101 | " self.params = read_yaml(params_filepath)\n", 102 | "\n", 103 | " create_directories([self.config.artifacts_root])\n", 104 | "\n", 105 | " \n", 106 | "\n", 107 | " def get_data_ingestion_config(self) -> DataIngestionConfig:\n", 108 | " config = self.config.data_ingestion\n", 109 | "\n", 110 | " create_directories([config.root_dir])\n", 111 | "\n", 112 | " data_ingestion_config = DataIngestionConfig(\n", 113 | " root_dir=config.root_dir,\n", 114 | " source_URL=config.source_URL,\n", 115 | " local_data_file=config.local_data_file,\n", 116 | " unzip_dir=config.unzip_dir \n", 117 | " )\n", 118 | "\n", 119 | " return data_ingestion_config" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 8, 125 
| "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "import os\n", 129 | "import urllib.request as request\n", 130 | "import zipfile\n", 131 | "from textSummarizer.logging import logger\n", 132 | "from textSummarizer.utils.common import get_size" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 9, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "class DataIngestion:\n", 142 | " def __init__(self, config: DataIngestionConfig):\n", 143 | " self.config = config\n", 144 | "\n", 145 | "\n", 146 | " \n", 147 | " def download_file(self):\n", 148 | " if not os.path.exists(self.config.local_data_file):\n", 149 | " filename, headers = request.urlretrieve(\n", 150 | " url = self.config.source_URL,\n", 151 | " filename = self.config.local_data_file\n", 152 | " )\n", 153 | " logger.info(f\"{filename} download! with following info: \\n{headers}\")\n", 154 | " else:\n", 155 | " logger.info(f\"File already exists of size: {get_size(Path(self.config.local_data_file))}\") \n", 156 | "\n", 157 | " \n", 158 | " \n", 159 | " def extract_zip_file(self):\n", 160 | " \"\"\"\n", 161 | " zip_file_path: str\n", 162 | " Extracts the zip file into the data directory\n", 163 | " Function returns None\n", 164 | " \"\"\"\n", 165 | " unzip_path = self.config.unzip_dir\n", 166 | " os.makedirs(unzip_path, exist_ok=True)\n", 167 | " with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:\n", 168 | " zip_ref.extractall(unzip_path)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 10, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "[2023-05-17 10:39:37,034: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 181 | "[2023-05-17 10:39:37,038: INFO: common: yaml file: params.yaml loaded successfully]\n", 182 | "[2023-05-17 10:39:37,040: INFO: common: created directory at: artifacts]\n", 183 | "[2023-05-17 10:39:37,042: INFO: common: created directory at: artifacts/data_ingestion]\n", 184 | "[2023-05-17 10:39:51,282: INFO: 1434958058: artifacts/data_ingestion/data.zip download! 
with following info: \n", 185 | "Connection: close\n", 186 | "Content-Length: 7903594\n", 187 | "Cache-Control: max-age=300\n", 188 | "Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox\n", 189 | "Content-Type: application/zip\n", 190 | "ETag: \"dbc016a060da18070593b83afff580c9b300f0b6ea4147a7988433e04df246ca\"\n", 191 | "Strict-Transport-Security: max-age=31536000\n", 192 | "X-Content-Type-Options: nosniff\n", 193 | "X-Frame-Options: deny\n", 194 | "X-XSS-Protection: 1; mode=block\n", 195 | "X-GitHub-Request-Id: 38C6:7AC2:33B271D:3D24998:64645A8C\n", 196 | "Accept-Ranges: bytes\n", 197 | "Date: Wed, 17 May 2023 04:39:41 GMT\n", 198 | "Via: 1.1 varnish\n", 199 | "X-Served-By: cache-mrs10532-MRS\n", 200 | "X-Cache: MISS\n", 201 | "X-Cache-Hits: 0\n", 202 | "X-Timer: S1684298381.769825,VS0,VE670\n", 203 | "Vary: Authorization,Accept-Encoding,Origin\n", 204 | "Access-Control-Allow-Origin: *\n", 205 | "X-Fastly-Request-ID: b2ff330ea1bb7f1da4c072d6a895f5e14951d76e\n", 206 | "Expires: Wed, 17 May 2023 04:44:41 GMT\n", 207 | "Source-Age: 0\n", 208 | "\n", 209 | "]\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "try:\n", 215 | " config = ConfigurationManager()\n", 216 | " data_ingestion_config = config.get_data_ingestion_config()\n", 217 | " data_ingestion = DataIngestion(config=data_ingestion_config)\n", 218 | " data_ingestion.download_file()\n", 219 | " data_ingestion.extract_zip_file()\n", 220 | "except Exception as e:\n", 221 | " raise e" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "textS", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.8.16" 249 | }, 250 | "orig_nbformat": 4 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 2 254 | } 255 | -------------------------------------------------------------------------------- /research/05_Model_evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | 
"source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "\n", 71 | "@dataclass(frozen=True)\n", 72 | "class ModelEvaluationConfig:\n", 73 | " root_dir: Path\n", 74 | " data_path: Path\n", 75 | " model_path: Path\n", 76 | " tokenizer_path: Path\n", 77 | " metric_file_name: Path" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from textSummarizer.constants import *\n", 87 | "from textSummarizer.utils.common import read_yaml, create_directories" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 7, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "class ConfigurationManager:\n", 97 | " def __init__(\n", 98 | " self,\n", 99 | " config_filepath = CONFIG_FILE_PATH,\n", 100 | " params_filepath = PARAMS_FILE_PATH):\n", 101 | "\n", 102 | " self.config = read_yaml(config_filepath)\n", 103 | " self.params = read_yaml(params_filepath)\n", 104 | "\n", 105 | " create_directories([self.config.artifacts_root])\n", 106 | "\n", 107 | "\n", 108 | " \n", 109 | " def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n", 110 | " config = self.config.model_evaluation\n", 111 | "\n", 112 | " create_directories([config.root_dir])\n", 113 | "\n", 114 | " model_evaluation_config = ModelEvaluationConfig(\n", 115 | " root_dir=config.root_dir,\n", 116 | " data_path=config.data_path,\n", 117 | " model_path = config.model_path,\n", 118 | " tokenizer_path = config.tokenizer_path,\n", 119 | " metric_file_name = config.metric_file_name\n", 120 | " \n", 121 | " )\n", 122 | "\n", 123 | " return model_evaluation_config\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 9, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", 133 | "from datasets import load_dataset, load_from_disk, load_metric\n", 134 | "import torch\n", 135 | "import pandas as pd\n", 136 | "from tqdm import tqdm" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 10, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "class ModelEvaluation:\n", 146 | " def __init__(self, config: ModelEvaluationConfig):\n", 147 | " self.config = config\n", 148 | "\n", 149 | "\n", 150 | " \n", 151 | " def generate_batch_sized_chunks(self,list_of_elements, batch_size):\n", 152 | " \"\"\"split the dataset into smaller batches that we can process simultaneously\n", 153 | " Yield successive batch-sized chunks from list_of_elements.\"\"\"\n", 154 | " for i in range(0, len(list_of_elements), batch_size):\n", 155 | " yield list_of_elements[i : i + batch_size]\n", 156 | "\n", 157 | " \n", 158 | " def calculate_metric_on_test_ds(self,dataset, metric, model, tokenizer, \n", 159 | " batch_size=16, device=\"cuda\" if torch.cuda.is_available() else \"cpu\", \n", 160 | " column_text=\"article\", \n", 161 | " column_summary=\"highlights\"):\n", 162 | " article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))\n", 163 | " target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))\n", 164 | "\n", 165 | " for article_batch, target_batch in tqdm(\n", 166 | " zip(article_batches, target_batches), total=len(article_batches)):\n", 167 | " \n", 168 | " inputs = tokenizer(article_batch, max_length=1024, truncation=True, \n", 169 | " padding=\"max_length\", return_tensors=\"pt\")\n", 170 | " \n", 171 | " 
summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n", 172 | " attention_mask=inputs[\"attention_mask\"].to(device), \n", 173 | " length_penalty=0.8, num_beams=8, max_length=128)\n", 174 | " ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''\n", 175 | " \n", 176 | " # Finally, we decode the generated texts, \n", 177 | " # replace the <n> token, and add the decoded texts with the references to the metric.\n", 178 | " decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, \n", 179 | " clean_up_tokenization_spaces=True) \n", 180 | " for s in summaries] \n", 181 | " \n", 182 | " decoded_summaries = [d.replace(\"<n>\", \" \") for d in decoded_summaries]\n", 183 | " \n", 184 | " \n", 185 | " metric.add_batch(predictions=decoded_summaries, references=target_batch)\n", 186 | " \n", 187 | " # Finally compute and return the ROUGE scores.\n", 188 | " score = metric.compute()\n", 189 | " return score\n", 190 | "\n", 191 | "\n", 192 | " def evaluate(self):\n", 193 | " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 194 | " tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n", 195 | " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n", 196 | " \n", 197 | " #loading data \n", 198 | " dataset_samsum_pt = load_from_disk(self.config.data_path)\n", 199 | "\n", 200 | "\n", 201 | " rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n", 202 | " \n", 203 | " rouge_metric = load_metric('rouge')\n", 204 | "\n", 205 | " score = self.calculate_metric_on_test_ds(\n", 206 | " dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'\n", 207 | " )\n", 208 | "\n", 209 | " rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )\n", 210 | "\n", 211 | " df = pd.DataFrame(rouge_dict, index = ['pegasus'] )\n", 212 | " df.to_csv(self.config.metric_file_name, index=False)\n", 213 | "\n", 214 | " \n", 215 | "\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "[2023-05-18 20:14:03,142: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 228 | "[2023-05-18 20:14:03,151: INFO: common: yaml file: params.yaml loaded successfully]\n", 229 | "[2023-05-18 20:14:03,153: INFO: common: created directory at: artifacts]\n", 230 | "[2023-05-18 20:14:03,155: INFO: common: created directory at: artifacts/model_evaluation]\n" 231 | ] 232 | }, 233 | { 234 | "name": "stderr", 235 | "output_type": "stream", 236 | "text": [ 237 | "C:\\Users\\bokti\\AppData\\Local\\Temp\\ipykernel_25280\\2973449339.py:59: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. 
Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n", 238 | " rouge_metric = load_metric('rouge')\n", 239 | "100%|██████████| 5/5 [03:54<00:00, 46.91s/it]" 240 | ] 241 | }, 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "[2023-05-18 20:18:18,394: INFO: rouge_scorer: Using default tokenizer.]\n" 247 | ] 248 | }, 249 | { 250 | "name": "stderr", 251 | "output_type": "stream", 252 | "text": [ 253 | "\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "try:\n", 259 | " config = ConfigurationManager()\n", 260 | " model_evaluation_config = config.get_model_evaluation_config()\n", 261 | " model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n", 262 | " model_evaluation_config.evaluate()\n", 263 | "except Exception as e:\n", 264 | " raise e" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [] 273 | } 274 | ], 275 | "metadata": { 276 | "kernelspec": { 277 | "display_name": "textS", 278 | "language": "python", 279 | "name": "python3" 280 | }, 281 | "language_info": { 282 | "codemirror_mode": { 283 | "name": "ipython", 284 | "version": 3 285 | }, 286 | "file_extension": ".py", 287 | "mimetype": "text/x-python", 288 | "name": "python", 289 | "nbconvert_exporter": "python", 290 | "pygments_lexer": "ipython3", 291 | "version": "3.8.16" 292 | }, 293 | "orig_nbformat": 4 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /research/04_model_trainer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "\n", 71 | "@dataclass(frozen=True)\n", 72 | "class ModelTrainerConfig:\n", 73 | " root_dir: Path\n", 74 | " data_path: Path\n", 75 | " model_ckpt: Path\n", 76 | " num_train_epochs: int\n", 77 | " warmup_steps: int\n", 78 | " per_device_train_batch_size: int\n", 79 | " weight_decay: float\n", 80 | " logging_steps: int\n", 81 | " evaluation_strategy: str\n", 82 | " eval_steps: int\n", 83 | " save_steps: float\n", 84 | " gradient_accumulation_steps: int" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | 
"metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "from textSummarizer.constants import *\n", 94 | "from textSummarizer.utils.common import read_yaml, create_directories" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "class ConfigurationManager:\n", 104 | " def __init__(\n", 105 | " self,\n", 106 | " config_filepath = CONFIG_FILE_PATH,\n", 107 | " params_filepath = PARAMS_FILE_PATH):\n", 108 | "\n", 109 | " self.config = read_yaml(config_filepath)\n", 110 | " self.params = read_yaml(params_filepath)\n", 111 | "\n", 112 | " create_directories([self.config.artifacts_root])\n", 113 | "\n", 114 | "\n", 115 | " \n", 116 | " def get_model_trainer_config(self) -> ModelTrainerConfig:\n", 117 | " config = self.config.model_trainer\n", 118 | " params = self.params.TrainingArguments\n", 119 | "\n", 120 | " create_directories([config.root_dir])\n", 121 | "\n", 122 | " model_trainer_config = ModelTrainerConfig(\n", 123 | " root_dir=config.root_dir,\n", 124 | " data_path=config.data_path,\n", 125 | " model_ckpt = config.model_ckpt,\n", 126 | " num_train_epochs = params.num_train_epochs,\n", 127 | " warmup_steps = params.warmup_steps,\n", 128 | " per_device_train_batch_size = params.per_device_train_batch_size,\n", 129 | " weight_decay = params.weight_decay,\n", 130 | " logging_steps = params.logging_steps,\n", 131 | " evaluation_strategy = params.evaluation_strategy,\n", 132 | " eval_steps = params.evaluation_strategy,\n", 133 | " save_steps = params.save_steps,\n", 134 | " gradient_accumulation_steps = params.gradient_accumulation_steps\n", 135 | " )\n", 136 | "\n", 137 | " return model_trainer_config" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stderr", 147 | "output_type": "stream", 148 | "text": [ 149 | "d:\\Softwares\\anaconda3\\envs\\textS\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 150 | " from .autonotebook import tqdm as notebook_tqdm\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "from transformers import TrainingArguments, Trainer\n", 156 | "from transformers import DataCollatorForSeq2Seq\n", 157 | "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", 158 | "from datasets import load_dataset, load_from_disk\n", 159 | "import torch" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 9, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "class ModelTrainer:\n", 169 | " def __init__(self, config: ModelTrainerConfig):\n", 170 | " self.config = config\n", 171 | "\n", 172 | "\n", 173 | " \n", 174 | " def train(self):\n", 175 | " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 176 | " tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)\n", 177 | " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)\n", 178 | " seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)\n", 179 | " \n", 180 | " #loading data \n", 181 | " dataset_samsum_pt = load_from_disk(self.config.data_path)\n", 182 | "\n", 183 | " # trainer_args = TrainingArguments(\n", 184 | " # output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,\n", 185 | " # per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,\n", 186 | " # weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,\n", 187 | " # evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,\n", 188 | " # gradient_accumulation_steps=self.config.gradient_accumulation_steps\n", 189 | " # ) \n", 190 | "\n", 191 | "\n", 192 | " trainer_args = TrainingArguments(\n", 193 | " output_dir=self.config.root_dir, num_train_epochs=1, warmup_steps=500,\n", 194 | " per_device_train_batch_size=1, per_device_eval_batch_size=1,\n", 195 | " weight_decay=0.01, logging_steps=10,\n", 196 | " evaluation_strategy='steps', eval_steps=500, save_steps=1e6,\n", 197 | " gradient_accumulation_steps=16\n", 198 | " ) \n", 199 | "\n", 200 | " trainer = Trainer(model=model_pegasus, args=trainer_args,\n", 201 | " tokenizer=tokenizer, data_collator=seq2seq_data_collator,\n", 202 | " train_dataset=dataset_samsum_pt[\"train\"], \n", 203 | " eval_dataset=dataset_samsum_pt[\"validation\"])\n", 204 | " \n", 205 | " trainer.train()\n", 206 | "\n", 207 | " ## Save model\n", 208 | " model_pegasus.save_pretrained(os.path.join(self.config.root_dir,\"pegasus-samsum-model\"))\n", 209 | " ## Save tokenizer\n", 210 | " tokenizer.save_pretrained(os.path.join(self.config.root_dir,\"tokenizer\"))\n" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 10, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "[2023-05-18 12:54:11,649: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 223 | "[2023-05-18 12:54:11,652: INFO: common: yaml file: params.yaml loaded successfully]\n", 224 | "[2023-05-18 12:54:11,654: INFO: common: created directory at: artifacts]\n", 225 | "[2023-05-18 12:54:11,655: INFO: common: created directory at: artifacts/model_trainer]\n" 226 | ] 227 | }, 228 | { 229 | "name": "stderr", 230 | "output_type": "stream", 231 | "text": [ 232 | 
"d:\\Softwares\\anaconda3\\envs\\textS\\lib\\site-packages\\transformers\\optimization.py:407: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", 233 | " warnings.warn(\n", 234 | " 0%| | 0/51 [00:00