├── test.py ├── src └── textSummarizer │ ├── __init__.py │ ├── config │ ├── __init__.py │ └── configuration.py │ ├── utils │ ├── __init__.py │ └── common.py │ ├── conponents │ ├── __init__.py │ ├── data_validation.py │ ├── data_ingestion.py │ ├── data_transformation.py │ ├── model_trainer.py │ └── model_evaluation.py │ ├── pipeline │ ├── __init__.py │ ├── stage_04_model_trainer.py │ ├── stage_02_data_validation.py │ ├── stage_03_data_transformation.py │ ├── stage_05_model_evaluation.py │ ├── stage_01_data_ingestion.py │ └── prediction.py │ ├── constants │ └── __init__.py │ ├── logging │ └── __init__.py │ └── entity │ └── __init__.py ├── params.yaml ├── requirements.txt ├── Dockerfile ├── setup.py ├── app.py ├── LICENSE ├── config └── config.yaml ├── template.py ├── main.py ├── README.md ├── .github └── workflows │ └── main.yaml └── research ├── 02_data_validation.ipynb ├── 03_data_transformation.ipynb ├── trials.ipynb ├── 01_data_ingestion.ipynb ├── 05_Model_evaluation.ipynb └── 04_model_trainer.ipynb /test.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/conponents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/textSummarizer/constants/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | CONFIG_FILE_PATH = Path("config/config.yaml") 4 | PARAMS_FILE_PATH = Path("params.yaml") -------------------------------------------------------------------------------- /params.yaml: -------------------------------------------------------------------------------- 1 | TrainingArguments: 2 | num_train_epochs: 1 3 | warmup_steps: 500 4 | per_device_train_batch_size: 1 5 | weight_decay: 0.01 6 | logging_steps: 10 7 | evaluation_strategy: steps 8 | eval_steps: 500 9 | save_steps: 1e6 10 | gradient_accumulation_steps: 16 11 | 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | transformers[sentencepiece] 3 | datasets 4 | sacrebleu 5 | rouge_score 6 | py7zr 7 | pandas 8 | nltk 9 | tqdm 10 | PyYAML 11 | matplotlib 12 | torch 13 | notebook 14 | boto3 15 | mypy-boto3-s3 16 | python-box==6.0.2 17 | ensure==1.0.2 18 | fastapi==0.78.0 19 | uvicorn==0.18.3 20 | Jinja2==3.1.2 21 | -e . 
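The `constants` package above pins the two YAML paths that every stage reads through `read_yaml` (defined later in `src/textSummarizer/utils/common.py`), and the final `-e .` line of requirements.txt installs the local `textSummarizer` package from setup.py in editable mode so those imports resolve against the working tree. A minimal sketch of how these pieces fit together (illustrative only, not a file in the repo, run from the project root):

```python
# Illustrative sketch: reading the pinned YAML paths through the project's own helpers.
from textSummarizer.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from textSummarizer.utils.common import read_yaml

config = read_yaml(CONFIG_FILE_PATH)   # ConfigBox built from config/config.yaml
params = read_yaml(PARAMS_FILE_PATH)   # ConfigBox built from params.yaml

# ConfigBox allows dot access to the keys defined in the YAML files
print(config.artifacts_root)                       # "artifacts"
print(params.TrainingArguments.num_train_epochs)   # 1
```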
22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim-buster 2 | 3 | RUN apt update -y && apt install awscli -y 4 | WORKDIR /app 5 | 6 | COPY . /app 7 | 8 | RUN pip install -r requirements.txt 9 | RUN pip install --upgrade accelerate 10 | RUN pip uninstall -y transformers accelerate 11 | RUN pip install transformers accelerate 12 | 13 | CMD ["python3", "app.py"] 14 | -------------------------------------------------------------------------------- /src/textSummarizer/logging/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | 5 | logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]" 6 | log_dir = "logs" 7 | log_filepath = os.path.join(log_dir,"running_logs.log") 8 | os.makedirs(log_dir, exist_ok=True) 9 | 10 | 11 | 12 | logging.basicConfig( 13 | level= logging.INFO, 14 | format= logging_str, 15 | 16 | handlers=[ 17 | logging.FileHandler(log_filepath), 18 | logging.StreamHandler(sys.stdout) 19 | ] 20 | ) 21 | 22 | logger = logging.getLogger("textSummarizerLogger") -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/stage_04_model_trainer.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.model_trainer import ModelTrainer 3 | from textSummarizer.logging import logger 4 | 5 | 6 | class ModelTrainerTrainingPipeline: 7 | def __init__(self): 8 | pass 9 | 10 | def main(self): 11 | config = ConfigurationManager() 12 | model_trainer_config = config.get_model_trainer_config() 13 | model_trainer_config = ModelTrainer(config=model_trainer_config) 14 | model_trainer_config.train() -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/stage_02_data_validation.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.data_validation import DataValiadtion 3 | from textSummarizer.logging import logger 4 | 5 | 6 | class DataValidationTrainingPipeline: 7 | def __init__(self): 8 | pass 9 | 10 | def main(self): 11 | config = ConfigurationManager() 12 | data_validation_config = config.get_data_validation_config() 13 | data_validation = DataValiadtion(config=data_validation_config) 14 | data_validation.validate_all_files_exist() -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/stage_03_data_transformation.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.data_transformation import DataTransformation 3 | from textSummarizer.logging import logger 4 | 5 | 6 | class DataTransformationTrainingPipeline: 7 | def __init__(self): 8 | pass 9 | 10 | def main(self): 11 | config = ConfigurationManager() 12 | data_transformation_config = config.get_data_transformation_config() 13 | data_transformation = DataTransformation(config=data_transformation_config) 14 | data_transformation.convert() -------------------------------------------------------------------------------- 
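Each pipeline module above is a thin wrapper: its `main()` builds a `ConfigurationManager`, asks it for the stage-specific config, and hands that to the matching component from `conponents/`. `main.py` (shown further down) chains all five stages, but a single stage can also be run on its own; a sketch of that, using the shared logger from `src/textSummarizer/logging/__init__.py`, which writes to both `logs/running_logs.log` and stdout:

```python
# Illustrative only: running one stage standalone, mirroring the pattern used in main.py.
from textSummarizer.logging import logger
from textSummarizer.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline

STAGE_NAME = "Data Ingestion stage"
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    DataIngestionTrainingPipeline().main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<")
except Exception as e:
    logger.exception(e)
    raise e
```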
/src/textSummarizer/pipeline/stage_05_model_evaluation.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.model_evaluation import ModelEvaluation 3 | from textSummarizer.logging import logger 4 | 5 | 6 | 7 | 8 | class ModelEvaluationTrainingPipeline: 9 | def __init__(self): 10 | pass 11 | 12 | def main(self): 13 | config = ConfigurationManager() 14 | model_evaluation_config = config.get_model_evaluation_config() 15 | model_evaluation_config = ModelEvaluation(config=model_evaluation_config) 16 | model_evaluation_config.evaluate() -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/stage_01_data_ingestion.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from textSummarizer.conponents.data_ingestion import DataIngestion 3 | from textSummarizer.logging import logger 4 | 5 | 6 | class DataIngestionTrainingPipeline: 7 | def __init__(self): 8 | pass 9 | 10 | def main(self): 11 | config = ConfigurationManager() 12 | data_ingestion_config = config.get_data_ingestion_config() 13 | data_ingestion = DataIngestion(config=data_ingestion_config) 14 | data_ingestion.download_file() 15 | data_ingestion.extract_zip_file() 16 | -------------------------------------------------------------------------------- /src/textSummarizer/pipeline/prediction.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.config.configuration import ConfigurationManager 2 | from transformers import AutoTokenizer 3 | from transformers import pipeline 4 | 5 | 6 | class PredictionPipeline: 7 | def __init__(self): 8 | self.config = ConfigurationManager().get_model_evaluation_config() 9 | 10 | 11 | 12 | def predict(self,text): 13 | tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path) 14 | gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128} 15 | 16 | pipe = pipeline("summarization", model=self.config.model_path,tokenizer=tokenizer) 17 | 18 | print("Dialogue:") 19 | print(text) 20 | 21 | output = pipe(text, **gen_kwargs)[0]["summary_text"] 22 | print("\nModel Summary:") 23 | print(output) 24 | 25 | return output -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as f: 4 | long_description = f.read() 5 | 6 | 7 | __version__ = "0.0.0" 8 | 9 | REPO_NAME = "Text-Summarizer-Project" 10 | AUTHOR_USER_NAME = "entbappy" 11 | SRC_REPO = "textSummarizer" 12 | AUTHOR_EMAIL = "entbappy73@gmail.com" 13 | 14 | 15 | 16 | setuptools.setup( 17 | name=SRC_REPO, 18 | version=__version__, 19 | author=AUTHOR_USER_NAME, 20 | author_email=AUTHOR_EMAIL, 21 | description="A small python package for NLP app", 22 | long_description=long_description, 23 | long_description_content="text/markdown", 24 | url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}", 25 | project_urls={ 26 | "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues", 27 | }, 28 | package_dir={"": "src"}, 29 | packages=setuptools.find_packages(where="src") 30 | ) -------------------------------------------------------------------------------- /app.py: 
-------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | import uvicorn 3 | import sys 4 | import os 5 | from fastapi.templating import Jinja2Templates 6 | from starlette.responses import RedirectResponse 7 | from fastapi.responses import Response 8 | from textSummarizer.pipeline.prediction import PredictionPipeline 9 | 10 | 11 | text:str = "What is Text Summarization?" 12 | 13 | app = FastAPI() 14 | 15 | @app.get("/", tags=["authentication"]) 16 | async def index(): 17 | return RedirectResponse(url="/docs") 18 | 19 | 20 | 21 | @app.get("/train") 22 | async def training(): 23 | try: 24 | os.system("python main.py") 25 | return Response("Training successful !!") 26 | 27 | except Exception as e: 28 | return Response(f"Error Occurred! {e}") 29 | 30 | 31 | 32 | 33 | @app.post("/predict") 34 | async def predict_route(text): 35 | try: 36 | 37 | obj = PredictionPipeline() 38 | text = obj.predict(text) 39 | return text 40 | except Exception as e: 41 | raise e 42 | 43 | 44 | if __name__=="__main__": 45 | uvicorn.run(app, host="0.0.0.0", port=8080) 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 BAPPY AHMED 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
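For reference, a hedged client sketch for the `app.py` service shown above: it binds to port 8080, `/train` shells out to `main.py`, and `/predict` takes the dialogue as a query parameter named `text` (the route declares an un-annotated `text` argument, which FastAPI treats as a query string). `requests` is not listed in requirements.txt, and the sample dialogue below is invented:

```python
# Hypothetical client for app.py; assumes the `requests` package is installed separately.
import requests

dialogue = (
    "Tom: Are we still on for dinner tonight?\n"
    "Anna: Yes, 7 pm at the usual place.\n"
    "Tom: Great, see you there!"
)

# /train runs `python main.py` (the full five-stage pipeline) on the server
print(requests.get("http://localhost:8080/train").text)

# /predict returns the summary produced by PredictionPipeline
summary = requests.post("http://localhost:8080/predict", params={"text": dialogue})
print(summary.text)
```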
22 | -------------------------------------------------------------------------------- /src/textSummarizer/conponents/data_validation.py: -------------------------------------------------------------------------------- 1 | import os 2 | from textSummarizer.logging import logger 3 | from textSummarizer.entity import DataValidationConfig 4 | 5 | class DataValiadtion: 6 | def __init__(self, config: DataValidationConfig): 7 | self.config = config 8 | 9 | 10 | 11 | def validate_all_files_exist(self)-> bool: 12 | try: 13 | validation_status = None 14 | 15 | all_files = os.listdir(os.path.join("artifacts","data_ingestion","samsum_dataset")) 16 | 17 | for file in all_files: 18 | if file not in self.config.ALL_REQUIRED_FILES: 19 | validation_status = False 20 | with open(self.config.STATUS_FILE, 'w') as f: 21 | f.write(f"Validation status: {validation_status}") 22 | else: 23 | validation_status = True 24 | with open(self.config.STATUS_FILE, 'w') as f: 25 | f.write(f"Validation status: {validation_status}") 26 | 27 | return validation_status 28 | 29 | except Exception as e: 30 | raise e 31 | -------------------------------------------------------------------------------- /src/textSummarizer/entity/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from pathlib import Path 3 | 4 | @dataclass(frozen=True) 5 | class DataIngestionConfig: 6 | root_dir: Path 7 | source_URL: str 8 | local_data_file: Path 9 | unzip_dir: Path 10 | 11 | 12 | 13 | @dataclass(frozen=True) 14 | class DataValidationConfig: 15 | root_dir: Path 16 | STATUS_FILE: str 17 | ALL_REQUIRED_FILES: list 18 | 19 | 20 | 21 | @dataclass(frozen=True) 22 | class DataTransformationConfig: 23 | root_dir: Path 24 | data_path: Path 25 | tokenizer_name: Path 26 | 27 | 28 | 29 | @dataclass(frozen=True) 30 | class ModelTrainerConfig: 31 | root_dir: Path 32 | data_path: Path 33 | model_ckpt: Path 34 | num_train_epochs: int 35 | warmup_steps: int 36 | per_device_train_batch_size: int 37 | weight_decay: float 38 | logging_steps: int 39 | evaluation_strategy: str 40 | eval_steps: int 41 | save_steps: float 42 | gradient_accumulation_steps: int 43 | 44 | 45 | 46 | @dataclass(frozen=True) 47 | class ModelEvaluationConfig: 48 | root_dir: Path 49 | data_path: Path 50 | model_path: Path 51 | tokenizer_path: Path 52 | metric_file_name: Path -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | artifacts_root: artifacts 2 | 3 | 4 | data_ingestion: 5 | root_dir: artifacts/data_ingestion 6 | source_URL: https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip 7 | local_data_file: artifacts/data_ingestion/data.zip 8 | unzip_dir: artifacts/data_ingestion 9 | 10 | 11 | 12 | data_validation: 13 | root_dir: artifacts/data_validation 14 | STATUS_FILE: artifacts/data_validation/status.txt 15 | ALL_REQUIRED_FILES: ["train", "test", "validation"] 16 | 17 | 18 | 19 | data_transformation: 20 | root_dir: artifacts/data_transformation 21 | data_path: artifacts/data_ingestion/samsum_dataset 22 | tokenizer_name: google/pegasus-cnn_dailymail 23 | 24 | 25 | 26 | 27 | model_trainer: 28 | root_dir: artifacts/model_trainer 29 | data_path: artifacts/data_transformation/samsum_dataset 30 | model_ckpt: google/pegasus-cnn_dailymail 31 | 32 | 33 | 34 | 35 | model_evaluation: 36 | root_dir: artifacts/model_evaluation 37 | data_path: 
artifacts/data_transformation/samsum_dataset 38 | model_path: artifacts/model_trainer/pegasus-samsum-model 39 | tokenizer_path: artifacts/model_trainer/tokenizer 40 | metric_file_name: artifacts/model_evaluation/metrics.csv 41 | 42 | -------------------------------------------------------------------------------- /src/textSummarizer/conponents/data_ingestion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request as request 3 | import zipfile 4 | from textSummarizer.logging import logger 5 | from textSummarizer.utils.common import get_size 6 | from pathlib import Path 7 | from textSummarizer.entity import DataIngestionConfig 8 | 9 | 10 | class DataIngestion: 11 | def __init__(self, config: DataIngestionConfig): 12 | self.config = config 13 | 14 | 15 | 16 | def download_file(self): 17 | if not os.path.exists(self.config.local_data_file): 18 | filename, headers = request.urlretrieve( 19 | url = self.config.source_URL, 20 | filename = self.config.local_data_file 21 | ) 22 | logger.info(f"{filename} download! with following info: \n{headers}") 23 | else: 24 | logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}") 25 | 26 | 27 | 28 | def extract_zip_file(self): 29 | """ 30 | zip_file_path: str 31 | Extracts the zip file into the data directory 32 | Function returns None 33 | """ 34 | unzip_path = self.config.unzip_dir 35 | os.makedirs(unzip_path, exist_ok=True) 36 | with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref: 37 | zip_ref.extractall(unzip_path) -------------------------------------------------------------------------------- /src/textSummarizer/conponents/data_transformation.py: -------------------------------------------------------------------------------- 1 | import os 2 | from textSummarizer.logging import logger 3 | from transformers import AutoTokenizer 4 | from datasets import load_dataset, load_from_disk 5 | from textSummarizer.entity import DataTransformationConfig 6 | 7 | 8 | 9 | class DataTransformation: 10 | def __init__(self, config: DataTransformationConfig): 11 | self.config = config 12 | self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name) 13 | 14 | 15 | 16 | def convert_examples_to_features(self,example_batch): 17 | input_encodings = self.tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True ) 18 | 19 | with self.tokenizer.as_target_tokenizer(): 20 | target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True ) 21 | 22 | return { 23 | 'input_ids' : input_encodings['input_ids'], 24 | 'attention_mask': input_encodings['attention_mask'], 25 | 'labels': target_encodings['input_ids'] 26 | } 27 | 28 | 29 | def convert(self): 30 | dataset_samsum = load_from_disk(self.config.data_path) 31 | dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched = True) 32 | dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir,"samsum_dataset")) 33 | 34 | 35 | -------------------------------------------------------------------------------- /template.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import logging 4 | 5 | logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s:') 6 | 7 | 8 | project_name = "textSummarizer" 9 | 10 | list_of_files = [ 11 | ".github/workflows/.gitkeep", 12 | f"src/{project_name}/__init__.py", 13 | 
f"src/{project_name}/conponents/__init__.py", 14 | f"src/{project_name}/utils/__init__.py", 15 | f"src/{project_name}/utils/common.py", 16 | f"src/{project_name}/logging/__init__.py", 17 | f"src/{project_name}/config/__init__.py", 18 | f"src/{project_name}/config/configuration.py", 19 | f"src/{project_name}/pipeline/__init__.py", 20 | f"src/{project_name}/entity/__init__.py", 21 | f"src/{project_name}/constants/__init__.py", 22 | "config/config.yaml", 23 | "params.yaml", 24 | "app.py", 25 | "main.py", 26 | "Dockerfile", 27 | "requirements.txt", 28 | "setup.py", 29 | "research/trials.ipynb", 30 | 31 | ] 32 | 33 | 34 | for filepath in list_of_files: 35 | filepath = Path(filepath) 36 | filedir, filename = os.path.split(filepath) 37 | 38 | if filedir != "": 39 | os.makedirs(filedir, exist_ok=True) 40 | logging.info(f"Creating directory:{filedir} for the file {filename}") 41 | 42 | 43 | if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0): 44 | with open(filepath,'w') as f: 45 | pass 46 | logging.info(f"Creating empty file: {filepath}") 47 | 48 | 49 | 50 | else: 51 | logging.info(f"{filename} is already exists") 52 | 53 | -------------------------------------------------------------------------------- /src/textSummarizer/utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | from box.exceptions import BoxValueError 3 | import yaml 4 | from textSummarizer.logging import logger 5 | from ensure import ensure_annotations 6 | from box import ConfigBox 7 | from pathlib import Path 8 | from typing import Any 9 | 10 | 11 | 12 | @ensure_annotations 13 | def read_yaml(path_to_yaml: Path) -> ConfigBox: 14 | """reads yaml file and returns 15 | 16 | Args: 17 | path_to_yaml (str): path like input 18 | 19 | Raises: 20 | ValueError: if yaml file is empty 21 | e: empty file 22 | 23 | Returns: 24 | ConfigBox: ConfigBox type 25 | """ 26 | try: 27 | with open(path_to_yaml) as yaml_file: 28 | content = yaml.safe_load(yaml_file) 29 | logger.info(f"yaml file: {path_to_yaml} loaded successfully") 30 | return ConfigBox(content) 31 | except BoxValueError: 32 | raise ValueError("yaml file is empty") 33 | except Exception as e: 34 | raise e 35 | 36 | 37 | 38 | @ensure_annotations 39 | def create_directories(path_to_directories: list, verbose=True): 40 | """create list of directories 41 | 42 | Args: 43 | path_to_directories (list): list of path of directories 44 | ignore_log (bool, optional): ignore if multiple dirs is to be created. Defaults to False. 
45 | """ 46 | for path in path_to_directories: 47 | os.makedirs(path, exist_ok=True) 48 | if verbose: 49 | logger.info(f"created directory at: {path}") 50 | 51 | 52 | 53 | @ensure_annotations 54 | def get_size(path: Path) -> str: 55 | """get size in KB 56 | 57 | Args: 58 | path (Path): path of the file 59 | 60 | Returns: 61 | str: size in KB 62 | """ 63 | size_in_kb = round(os.path.getsize(path)/1024) 64 | return f"~ {size_in_kb} KB" 65 | 66 | 67 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline 2 | from textSummarizer.pipeline.stage_02_data_validation import DataValidationTrainingPipeline 3 | from textSummarizer.pipeline.stage_03_data_transformation import DataTransformationTrainingPipeline 4 | from textSummarizer.pipeline.stage_04_model_trainer import ModelTrainerTrainingPipeline 5 | from textSummarizer.pipeline.stage_05_model_evaluation import ModelEvaluationTrainingPipeline 6 | from textSummarizer.logging import logger 7 | 8 | 9 | STAGE_NAME = "Data Ingestion stage" 10 | try: 11 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 12 | data_ingestion = DataIngestionTrainingPipeline() 13 | data_ingestion.main() 14 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 15 | except Exception as e: 16 | logger.exception(e) 17 | raise e 18 | 19 | 20 | 21 | 22 | STAGE_NAME = "Data Validation stage" 23 | try: 24 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 25 | data_validation = DataValidationTrainingPipeline() 26 | data_validation.main() 27 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 28 | except Exception as e: 29 | logger.exception(e) 30 | raise e 31 | 32 | 33 | 34 | STAGE_NAME = "Data Transformation stage" 35 | try: 36 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 37 | data_transformation = DataTransformationTrainingPipeline() 38 | data_transformation.main() 39 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 40 | except Exception as e: 41 | logger.exception(e) 42 | raise e 43 | 44 | 45 | 46 | STAGE_NAME = "Model Trainer stage" 47 | try: 48 | logger.info(f"*******************") 49 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 50 | model_trainer = ModelTrainerTrainingPipeline() 51 | model_trainer.main() 52 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 53 | except Exception as e: 54 | logger.exception(e) 55 | raise e 56 | 57 | 58 | 59 | 60 | STAGE_NAME = "Model Evaluation stage" 61 | try: 62 | logger.info(f"*******************") 63 | logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 64 | model_evaluation = ModelEvaluationTrainingPipeline() 65 | model_evaluation.main() 66 | logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") 67 | except Exception as e: 68 | logger.exception(e) 69 | raise e 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /src/textSummarizer/conponents/model_trainer.py: -------------------------------------------------------------------------------- 1 | from transformers import TrainingArguments, Trainer 2 | from transformers import DataCollatorForSeq2Seq 3 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 4 | from datasets import load_dataset, load_from_disk 5 | from textSummarizer.entity import 
ModelTrainerConfig 6 | import torch 7 | import os 8 | 9 | 10 | class ModelTrainer: 11 | def __init__(self, config: ModelTrainerConfig): 12 | self.config = config 13 | 14 | 15 | 16 | def train(self): 17 | device = "cuda" if torch.cuda.is_available() else "cpu" 18 | tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt) 19 | model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device) 20 | seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus) 21 | 22 | #loading data 23 | dataset_samsum_pt = load_from_disk(self.config.data_path) 24 | 25 | # trainer_args = TrainingArguments( 26 | # output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps, 27 | # per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size, 28 | # weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps, 29 | # evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6, 30 | # gradient_accumulation_steps=self.config.gradient_accumulation_steps 31 | # ) 32 | 33 | 34 | trainer_args = TrainingArguments( 35 | output_dir=self.config.root_dir, num_train_epochs=1, warmup_steps=500, 36 | per_device_train_batch_size=1, per_device_eval_batch_size=1, 37 | weight_decay=0.01, logging_steps=10, 38 | evaluation_strategy='steps', eval_steps=500, save_steps=1e6, 39 | gradient_accumulation_steps=16 40 | ) 41 | 42 | trainer = Trainer(model=model_pegasus, args=trainer_args, 43 | tokenizer=tokenizer, data_collator=seq2seq_data_collator, 44 | train_dataset=dataset_samsum_pt["train"], 45 | eval_dataset=dataset_samsum_pt["validation"]) 46 | 47 | trainer.train() 48 | 49 | ## Save model 50 | model_pegasus.save_pretrained(os.path.join(self.config.root_dir,"pegasus-samsum-model")) 51 | ## Save tokenizer 52 | tokenizer.save_pretrained(os.path.join(self.config.root_dir,"tokenizer")) 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # End to end Text-Summarizer-Project 2 | 3 | ## Workflows 4 | 5 | 1. Update config.yaml 6 | 2. Update params.yaml 7 | 3. Update entity 8 | 4. Update the configuration manager in src config 9 | 5. update the conponents 10 | 6. update the pipeline 11 | 7. update the main.py 12 | 8. update the app.py 13 | 14 | 15 | # How to run? 16 | ### STEPS: 17 | 18 | Clone the repository 19 | 20 | ```bash 21 | https://github.com/entbappy/End-to-end-Text-Summarization 22 | ``` 23 | ### STEP 01- Create a conda environment after opening the repository 24 | 25 | ```bash 26 | conda create -n summary python=3.8 -y 27 | ``` 28 | 29 | ```bash 30 | conda activate summary 31 | ``` 32 | 33 | 34 | ### STEP 02- install the requirements 35 | ```bash 36 | pip install -r requirements.txt 37 | ``` 38 | 39 | 40 | ```bash 41 | # Finally run the following command 42 | python app.py 43 | ``` 44 | 45 | Now, 46 | ```bash 47 | open up you local host and port 48 | ``` 49 | 50 | 51 | ```bash 52 | Author: Krish Naik 53 | Data Scientist 54 | Email: krishnaik06@gmail.com 55 | 56 | ``` 57 | 58 | 59 | 60 | # AWS-CICD-Deployment-with-Github-Actions 61 | 62 | ## 1. Login to AWS console. 63 | 64 | ## 2. Create IAM user for deployment 65 | 66 | #with specific access 67 | 68 | 1. EC2 access : It is virtual machine 69 | 70 | 2. 
ECR: Elastic Container registry to save your docker image in aws 71 | 72 | 73 | #Description: About the deployment 74 | 75 | 1. Build docker image of the source code 76 | 77 | 2. Push your docker image to ECR 78 | 79 | 3. Launch Your EC2 80 | 81 | 4. Pull Your image from ECR in EC2 82 | 83 | 5. Lauch your docker image in EC2 84 | 85 | #Policy: 86 | 87 | 1. AmazonEC2ContainerRegistryFullAccess 88 | 89 | 2. AmazonEC2FullAccess 90 | 91 | 92 | ## 3. Create ECR repo to store/save docker image 93 | - Save the URI: 566373416292.dkr.ecr.us-east-1.amazonaws.com/text-s 94 | 95 | 96 | ## 4. Create EC2 machine (Ubuntu) 97 | 98 | ## 5. Open EC2 and Install docker in EC2 Machine: 99 | 100 | 101 | #optinal 102 | 103 | sudo apt-get update -y 104 | 105 | sudo apt-get upgrade 106 | 107 | #required 108 | 109 | curl -fsSL https://get.docker.com -o get-docker.sh 110 | 111 | sudo sh get-docker.sh 112 | 113 | sudo usermod -aG docker ubuntu 114 | 115 | newgrp docker 116 | 117 | # 6. Configure EC2 as self-hosted runner: 118 | setting>actions>runner>new self hosted runner> choose os> then run command one by one 119 | 120 | 121 | # 7. Setup github secrets: 122 | 123 | AWS_ACCESS_KEY_ID= 124 | 125 | AWS_SECRET_ACCESS_KEY= 126 | 127 | AWS_REGION = us-east-1 128 | 129 | AWS_ECR_LOGIN_URI = demo>> 566373416292.dkr.ecr.ap-south-1.amazonaws.com 130 | 131 | ECR_REPOSITORY_NAME = simple-app 132 | -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: workflow 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths-ignore: 8 | - 'README.md' 9 | 10 | permissions: 11 | id-token: write 12 | contents: read 13 | 14 | jobs: 15 | integration: 16 | name: Continuous Integration 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout Code 20 | uses: actions/checkout@v3 21 | 22 | - name: Lint code 23 | run: echo "Linting repository" 24 | 25 | - name: Run unit tests 26 | run: echo "Running unit tests" 27 | 28 | build-and-push-ecr-image: 29 | name: Continuous Delivery 30 | needs: integration 31 | runs-on: ubuntu-latest 32 | steps: 33 | - name: Checkout Code 34 | uses: actions/checkout@v3 35 | 36 | - name: Install Utilities 37 | run: | 38 | sudo apt-get update 39 | sudo apt-get install -y jq unzip 40 | - name: Configure AWS credentials 41 | uses: aws-actions/configure-aws-credentials@v1 42 | with: 43 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 44 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 45 | aws-region: ${{ secrets.AWS_REGION }} 46 | 47 | - name: Login to Amazon ECR 48 | id: login-ecr 49 | uses: aws-actions/amazon-ecr-login@v1 50 | 51 | - name: Build, tag, and push image to Amazon ECR 52 | id: build-image 53 | env: 54 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 55 | ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY_NAME }} 56 | IMAGE_TAG: latest 57 | run: | 58 | # Build a docker container and 59 | # push it to ECR so that it can 60 | # be deployed to ECS. 61 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 
62 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 63 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 64 | 65 | 66 | Continuous-Deployment: 67 | needs: build-and-push-ecr-image 68 | runs-on: self-hosted 69 | steps: 70 | - name: Checkout 71 | uses: actions/checkout@v3 72 | 73 | - name: Configure AWS credentials 74 | uses: aws-actions/configure-aws-credentials@v1 75 | with: 76 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 77 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 78 | aws-region: ${{ secrets.AWS_REGION }} 79 | 80 | - name: Login to Amazon ECR 81 | id: login-ecr 82 | uses: aws-actions/amazon-ecr-login@v1 83 | 84 | 85 | - name: Pull latest images 86 | run: | 87 | docker pull ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest 88 | 89 | # - name: Stop and remove container if running 90 | # run: | 91 | # docker ps -q --filter "name=texts" | grep -q . && docker stop texts && docker rm -fv texts 92 | 93 | - name: Run Docker Image to serve users 94 | run: | 95 | docker run -d -p 8080:8080 --name=texts -e 'AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}' -e 'AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}' -e 'AWS_REGION=${{ secrets.AWS_REGION }}' ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest 96 | - name: Clean previous images and containers 97 | run: | 98 | docker system prune -f -------------------------------------------------------------------------------- /src/textSummarizer/conponents/model_evaluation.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 2 | from datasets import load_dataset, load_from_disk, load_metric 3 | import torch 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from textSummarizer.entity import ModelEvaluationConfig 7 | 8 | 9 | 10 | 11 | class ModelEvaluation: 12 | def __init__(self, config: ModelEvaluationConfig): 13 | self.config = config 14 | 15 | 16 | 17 | def generate_batch_sized_chunks(self,list_of_elements, batch_size): 18 | """split the dataset into smaller batches that we can process simultaneously 19 | Yield successive batch-sized chunks from list_of_elements.""" 20 | for i in range(0, len(list_of_elements), batch_size): 21 | yield list_of_elements[i : i + batch_size] 22 | 23 | 24 | def calculate_metric_on_test_ds(self,dataset, metric, model, tokenizer, 25 | batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu", 26 | column_text="article", 27 | column_summary="highlights"): 28 | article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size)) 29 | target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size)) 30 | 31 | for article_batch, target_batch in tqdm( 32 | zip(article_batches, target_batches), total=len(article_batches)): 33 | 34 | inputs = tokenizer(article_batch, max_length=1024, truncation=True, 35 | padding="max_length", return_tensors="pt") 36 | 37 | summaries = model.generate(input_ids=inputs["input_ids"].to(device), 38 | attention_mask=inputs["attention_mask"].to(device), 39 | length_penalty=0.8, num_beams=8, max_length=128) 40 | ''' parameter for length penalty ensures that the model does not generate sequences that are too long. ''' 41 | 42 | # Finally, we decode the generated texts, 43 | # replace the token, and add the decoded texts with the references to the metric. 
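# (skip_special_tokens drops pad/eos markers during decoding, and clean_up_tokenization_spaces tidies spacing around punctuation in the decoded text.)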
44 | decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 45 | clean_up_tokenization_spaces=True) 46 | for s in summaries] 47 | 48 | decoded_summaries = [d.replace("", " ") for d in decoded_summaries] 49 | 50 | 51 | metric.add_batch(predictions=decoded_summaries, references=target_batch) 52 | 53 | # Finally compute and return the ROUGE scores. 54 | score = metric.compute() 55 | return score 56 | 57 | 58 | def evaluate(self): 59 | device = "cuda" if torch.cuda.is_available() else "cpu" 60 | tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path) 61 | model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device) 62 | 63 | #loading data 64 | dataset_samsum_pt = load_from_disk(self.config.data_path) 65 | 66 | 67 | rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"] 68 | 69 | rouge_metric = load_metric('rouge') 70 | 71 | score = self.calculate_metric_on_test_ds( 72 | dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary' 73 | ) 74 | 75 | rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names ) 76 | 77 | df = pd.DataFrame(rouge_dict, index = ['pegasus'] ) 78 | df.to_csv(self.config.metric_file_name, index=False) 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /src/textSummarizer/config/configuration.py: -------------------------------------------------------------------------------- 1 | from textSummarizer.constants import * 2 | from textSummarizer.utils.common import read_yaml, create_directories 3 | from textSummarizer.entity import (DataIngestionConfig, 4 | DataValidationConfig, 5 | DataTransformationConfig, 6 | ModelTrainerConfig, 7 | ModelEvaluationConfig) 8 | 9 | 10 | class ConfigurationManager: 11 | def __init__( 12 | self, 13 | config_filepath = CONFIG_FILE_PATH, 14 | params_filepath = PARAMS_FILE_PATH): 15 | 16 | self.config = read_yaml(config_filepath) 17 | self.params = read_yaml(params_filepath) 18 | 19 | create_directories([self.config.artifacts_root]) 20 | 21 | 22 | 23 | def get_data_ingestion_config(self) -> DataIngestionConfig: 24 | config = self.config.data_ingestion 25 | 26 | create_directories([config.root_dir]) 27 | 28 | data_ingestion_config = DataIngestionConfig( 29 | root_dir=config.root_dir, 30 | source_URL=config.source_URL, 31 | local_data_file=config.local_data_file, 32 | unzip_dir=config.unzip_dir 33 | ) 34 | 35 | return data_ingestion_config 36 | 37 | 38 | 39 | def get_data_validation_config(self) -> DataValidationConfig: 40 | config = self.config.data_validation 41 | 42 | create_directories([config.root_dir]) 43 | 44 | data_validation_config = DataValidationConfig( 45 | root_dir=config.root_dir, 46 | STATUS_FILE=config.STATUS_FILE, 47 | ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES, 48 | ) 49 | 50 | return data_validation_config 51 | 52 | 53 | def get_data_transformation_config(self) -> DataTransformationConfig: 54 | config = self.config.data_transformation 55 | 56 | create_directories([config.root_dir]) 57 | 58 | data_transformation_config = DataTransformationConfig( 59 | root_dir=config.root_dir, 60 | data_path=config.data_path, 61 | tokenizer_name = config.tokenizer_name 62 | ) 63 | 64 | return data_transformation_config 65 | 66 | 67 | 68 | def get_model_trainer_config(self) -> ModelTrainerConfig: 69 | config = self.config.model_trainer 70 | params = self.params.TrainingArguments 71 | 72 | create_directories([config.root_dir]) 73 | 74 | 
model_trainer_config = ModelTrainerConfig( 75 | root_dir=config.root_dir, 76 | data_path=config.data_path, 77 | model_ckpt = config.model_ckpt, 78 | num_train_epochs = params.num_train_epochs, 79 | warmup_steps = params.warmup_steps, 80 | per_device_train_batch_size = params.per_device_train_batch_size, 81 | weight_decay = params.weight_decay, 82 | logging_steps = params.logging_steps, 83 | evaluation_strategy = params.evaluation_strategy, 84 | eval_steps = params.evaluation_strategy, 85 | save_steps = params.save_steps, 86 | gradient_accumulation_steps = params.gradient_accumulation_steps 87 | ) 88 | 89 | return model_trainer_config 90 | 91 | 92 | def get_model_evaluation_config(self) -> ModelEvaluationConfig: 93 | config = self.config.model_evaluation 94 | 95 | create_directories([config.root_dir]) 96 | 97 | model_evaluation_config = ModelEvaluationConfig( 98 | root_dir=config.root_dir, 99 | data_path=config.data_path, 100 | model_path = config.model_path, 101 | tokenizer_path = config.tokenizer_path, 102 | metric_file_name = config.metric_file_name 103 | 104 | ) 105 | 106 | return model_evaluation_config 107 | -------------------------------------------------------------------------------- /research/02_data_validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "\n", 71 | "@dataclass(frozen=True)\n", 72 | "class DataValidationConfig:\n", 73 | " root_dir: Path\n", 74 | " STATUS_FILE: str\n", 75 | " ALL_REQUIRED_FILES: list" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from textSummarizer.constants import *\n", 85 | "from textSummarizer.utils.common import read_yaml, create_directories" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "class ConfigurationManager:\n", 95 | " def __init__(\n", 96 | " self,\n", 97 | " config_filepath = CONFIG_FILE_PATH,\n", 98 | " params_filepath = PARAMS_FILE_PATH):\n", 99 | "\n", 100 | " self.config = read_yaml(config_filepath)\n", 101 | " self.params = read_yaml(params_filepath)\n", 102 | "\n", 103 | " create_directories([self.config.artifacts_root])\n", 104 | "\n", 105 | "\n", 106 | " \n", 
107 | " def get_data_validation_config(self) -> DataValidationConfig:\n", 108 | " config = self.config.data_validation\n", 109 | "\n", 110 | " create_directories([config.root_dir])\n", 111 | "\n", 112 | " data_validation_config = DataValidationConfig(\n", 113 | " root_dir=config.root_dir,\n", 114 | " STATUS_FILE=config.STATUS_FILE,\n", 115 | " ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,\n", 116 | " )\n", 117 | "\n", 118 | " return data_validation_config" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 8, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "import os\n", 128 | "from textSummarizer.logging import logger" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 9, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "class DataValiadtion:\n", 138 | " def __init__(self, config: DataValidationConfig):\n", 139 | " self.config = config\n", 140 | "\n", 141 | "\n", 142 | " \n", 143 | " def validate_all_files_exist(self)-> bool:\n", 144 | " try:\n", 145 | " validation_status = None\n", 146 | "\n", 147 | " all_files = os.listdir(os.path.join(\"artifacts\",\"data_ingestion\",\"samsum_dataset\"))\n", 148 | "\n", 149 | " for file in all_files:\n", 150 | " if file not in self.config.ALL_REQUIRED_FILES:\n", 151 | " validation_status = False\n", 152 | " with open(self.config.STATUS_FILE, 'w') as f:\n", 153 | " f.write(f\"Validation status: {validation_status}\")\n", 154 | " else:\n", 155 | " validation_status = True\n", 156 | " with open(self.config.STATUS_FILE, 'w') as f:\n", 157 | " f.write(f\"Validation status: {validation_status}\")\n", 158 | "\n", 159 | " return validation_status\n", 160 | " \n", 161 | " except Exception as e:\n", 162 | " raise e\n" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 10, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "[2023-05-17 11:58:23,823: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 175 | "[2023-05-17 11:58:23,826: INFO: common: yaml file: params.yaml loaded successfully]\n", 176 | "[2023-05-17 11:58:23,828: INFO: common: created directory at: artifacts]\n", 177 | "[2023-05-17 11:58:23,829: INFO: common: created directory at: artifacts/data_validation]\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "try:\n", 183 | " config = ConfigurationManager()\n", 184 | " data_validation_config = config.get_data_validation_config()\n", 185 | " data_validation = DataValiadtion(config=data_validation_config)\n", 186 | " data_validation.validate_all_files_exist()\n", 187 | "except Exception as e:\n", 188 | " raise e" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "textS", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.8.16" 216 | }, 217 | "orig_nbformat": 4 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /research/03_data_transformation.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "\n", 71 | "@dataclass(frozen=True)\n", 72 | "class DataTransformationConfig:\n", 73 | " root_dir: Path\n", 74 | " data_path: Path\n", 75 | " tokenizer_name: Path\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from textSummarizer.constants import *\n", 85 | "from textSummarizer.utils.common import read_yaml, create_directories" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "class ConfigurationManager:\n", 95 | " def __init__(\n", 96 | " self,\n", 97 | " config_filepath = CONFIG_FILE_PATH,\n", 98 | " params_filepath = PARAMS_FILE_PATH):\n", 99 | "\n", 100 | " self.config = read_yaml(config_filepath)\n", 101 | " self.params = read_yaml(params_filepath)\n", 102 | "\n", 103 | " create_directories([self.config.artifacts_root])\n", 104 | "\n", 105 | "\n", 106 | " \n", 107 | " def get_data_transformation_config(self) -> DataTransformationConfig:\n", 108 | " config = self.config.data_transformation\n", 109 | "\n", 110 | " create_directories([config.root_dir])\n", 111 | "\n", 112 | " data_transformation_config = DataTransformationConfig(\n", 113 | " root_dir=config.root_dir,\n", 114 | " data_path=config.data_path,\n", 115 | " tokenizer_name = config.tokenizer_name\n", 116 | " )\n", 117 | "\n", 118 | " return data_transformation_config\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 9, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "import os\n", 128 | "from textSummarizer.logging import logger\n", 129 | "from transformers import AutoTokenizer\n", 130 | "from datasets import load_dataset, load_from_disk" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 10, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "class DataTransformation:\n", 140 | " def __init__(self, config: DataTransformationConfig):\n", 141 | " self.config = config\n", 142 | " self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)\n", 143 | "\n", 144 | "\n", 145 | " \n", 146 | " def 
convert_examples_to_features(self,example_batch):\n", 147 | " input_encodings = self.tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )\n", 148 | " \n", 149 | " with self.tokenizer.as_target_tokenizer():\n", 150 | " target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )\n", 151 | " \n", 152 | " return {\n", 153 | " 'input_ids' : input_encodings['input_ids'],\n", 154 | " 'attention_mask': input_encodings['attention_mask'],\n", 155 | " 'labels': target_encodings['input_ids']\n", 156 | " }\n", 157 | " \n", 158 | "\n", 159 | " def convert(self):\n", 160 | " dataset_samsum = load_from_disk(self.config.data_path)\n", 161 | " dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched = True)\n", 162 | " dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir,\"samsum_dataset\"))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 11, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "[2023-05-18 08:51:29,881: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 175 | "[2023-05-18 08:51:29,892: INFO: common: yaml file: params.yaml loaded successfully]\n", 176 | "[2023-05-18 08:51:29,893: INFO: common: created directory at: artifacts]\n", 177 | "[2023-05-18 08:51:29,894: INFO: common: created directory at: artifacts/data_transformation]\n" 178 | ] 179 | }, 180 | { 181 | "name": "stderr", 182 | "output_type": "stream", 183 | "text": [ 184 | "Map: 0%| | 0/14732 [00:00 1\u001b[0m d\u001b[39m.\u001b[39;49mkey\n", 45 | "\u001b[1;31mAttributeError\u001b[0m: 'dict' object has no attribute 'key'" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "d.key" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from box import ConfigBox" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "d2 = ConfigBox({\"key\": \"value\", \"key1\": \"value1\"})" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 6, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "ConfigBox({'key': 'value', 'key1': 'value1'})" 80 | ] 81 | }, 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "d2" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "'value'" 100 | ] 101 | }, 102 | "execution_count": 7, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "d2.key" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "def get_product(x: int, y: int) -> int:\n", 118 | " return x * y" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 9, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "8" 130 | ] 131 | }, 132 | "execution_count": 9, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "get_product(x = 2, y = 4)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 10, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 
149 | "'44'" 150 | ] 151 | }, 152 | "execution_count": 10, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "get_product(x = 2, y = \"4\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 11, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "from ensure import ensure_annotations" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 12, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "@ensure_annotations\n", 177 | "def get_product(x: int, y: int) -> int:\n", 178 | " return x * y" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 13, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "8" 190 | ] 191 | }, 192 | "execution_count": 13, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "get_product(x = 2, y = 4)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 14, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "ename": "EnsureError", 208 | "evalue": "Argument y of type to does not match annotation type ", 209 | "output_type": "error", 210 | "traceback": [ 211 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 212 | "\u001b[1;31mEnsureError\u001b[0m Traceback (most recent call last)", 213 | "Cell \u001b[1;32mIn[14], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m get_product(x \u001b[39m=\u001b[39;49m \u001b[39m2\u001b[39;49m, y \u001b[39m=\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39m4\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", 214 | "File \u001b[1;32md:\\Softwares\\anaconda3\\envs\\textS\\lib\\site-packages\\ensure\\main.py:845\u001b[0m, in \u001b[0;36mWrappedFunctionReturn.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 840\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(value, templ):\n\u001b[0;32m 841\u001b[0m msg \u001b[39m=\u001b[39m (\n\u001b[0;32m 842\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mArgument \u001b[39m\u001b[39m{arg}\u001b[39;00m\u001b[39m of type \u001b[39m\u001b[39m{valt}\u001b[39;00m\u001b[39m to \u001b[39m\u001b[39m{f}\u001b[39;00m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 843\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mdoes not match annotation type \u001b[39m\u001b[39m{t}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 844\u001b[0m )\n\u001b[1;32m--> 845\u001b[0m \u001b[39mraise\u001b[39;00m EnsureError(msg\u001b[39m.\u001b[39mformat(\n\u001b[0;32m 846\u001b[0m arg\u001b[39m=\u001b[39marg, f\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mf, t\u001b[39m=\u001b[39mtempl, valt\u001b[39m=\u001b[39m\u001b[39mtype\u001b[39m(value)\n\u001b[0;32m 847\u001b[0m ))\n\u001b[0;32m 849\u001b[0m return_val \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mf(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 850\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(return_val, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturn_templ):\n", 215 | "\u001b[1;31mEnsureError\u001b[0m: Argument y of type to does not match annotation type " 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "get_product(x = 2, y = \"4\")" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [] 
229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "textS", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.8.16" 248 | }, 249 | "orig_nbformat": 4 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 2 253 | } 254 | -------------------------------------------------------------------------------- /research/01_data_ingestion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "@dataclass(frozen=True)\n", 71 | "class DataIngestionConfig:\n", 72 | " root_dir: Path\n", 73 | " source_URL: str\n", 74 | " local_data_file: Path\n", 75 | " unzip_dir: Path" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from textSummarizer.constants import *\n", 85 | "from textSummarizer.utils.common import read_yaml, create_directories" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "class ConfigurationManager:\n", 95 | " def __init__(\n", 96 | " self,\n", 97 | " config_filepath = CONFIG_FILE_PATH,\n", 98 | " params_filepath = PARAMS_FILE_PATH):\n", 99 | "\n", 100 | " self.config = read_yaml(config_filepath)\n", 101 | " self.params = read_yaml(params_filepath)\n", 102 | "\n", 103 | " create_directories([self.config.artifacts_root])\n", 104 | "\n", 105 | " \n", 106 | "\n", 107 | " def get_data_ingestion_config(self) -> DataIngestionConfig:\n", 108 | " config = self.config.data_ingestion\n", 109 | "\n", 110 | " create_directories([config.root_dir])\n", 111 | "\n", 112 | " data_ingestion_config = DataIngestionConfig(\n", 113 | " root_dir=config.root_dir,\n", 114 | " source_URL=config.source_URL,\n", 115 | " local_data_file=config.local_data_file,\n", 116 | " unzip_dir=config.unzip_dir \n", 117 | " )\n", 118 | "\n", 119 | " return data_ingestion_config" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 8, 125 
| "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "import os\n", 129 | "import urllib.request as request\n", 130 | "import zipfile\n", 131 | "from textSummarizer.logging import logger\n", 132 | "from textSummarizer.utils.common import get_size" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 9, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "class DataIngestion:\n", 142 | " def __init__(self, config: DataIngestionConfig):\n", 143 | " self.config = config\n", 144 | "\n", 145 | "\n", 146 | " \n", 147 | " def download_file(self):\n", 148 | " if not os.path.exists(self.config.local_data_file):\n", 149 | " filename, headers = request.urlretrieve(\n", 150 | " url = self.config.source_URL,\n", 151 | " filename = self.config.local_data_file\n", 152 | " )\n", 153 | " logger.info(f\"{filename} download! with following info: \\n{headers}\")\n", 154 | " else:\n", 155 | " logger.info(f\"File already exists of size: {get_size(Path(self.config.local_data_file))}\") \n", 156 | "\n", 157 | " \n", 158 | " \n", 159 | " def extract_zip_file(self):\n", 160 | " \"\"\"\n", 161 | " zip_file_path: str\n", 162 | " Extracts the zip file into the data directory\n", 163 | " Function returns None\n", 164 | " \"\"\"\n", 165 | " unzip_path = self.config.unzip_dir\n", 166 | " os.makedirs(unzip_path, exist_ok=True)\n", 167 | " with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:\n", 168 | " zip_ref.extractall(unzip_path)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 10, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "[2023-05-17 10:39:37,034: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 181 | "[2023-05-17 10:39:37,038: INFO: common: yaml file: params.yaml loaded successfully]\n", 182 | "[2023-05-17 10:39:37,040: INFO: common: created directory at: artifacts]\n", 183 | "[2023-05-17 10:39:37,042: INFO: common: created directory at: artifacts/data_ingestion]\n", 184 | "[2023-05-17 10:39:51,282: INFO: 1434958058: artifacts/data_ingestion/data.zip download! 
with following info: \n", 185 | "Connection: close\n", 186 | "Content-Length: 7903594\n", 187 | "Cache-Control: max-age=300\n", 188 | "Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox\n", 189 | "Content-Type: application/zip\n", 190 | "ETag: \"dbc016a060da18070593b83afff580c9b300f0b6ea4147a7988433e04df246ca\"\n", 191 | "Strict-Transport-Security: max-age=31536000\n", 192 | "X-Content-Type-Options: nosniff\n", 193 | "X-Frame-Options: deny\n", 194 | "X-XSS-Protection: 1; mode=block\n", 195 | "X-GitHub-Request-Id: 38C6:7AC2:33B271D:3D24998:64645A8C\n", 196 | "Accept-Ranges: bytes\n", 197 | "Date: Wed, 17 May 2023 04:39:41 GMT\n", 198 | "Via: 1.1 varnish\n", 199 | "X-Served-By: cache-mrs10532-MRS\n", 200 | "X-Cache: MISS\n", 201 | "X-Cache-Hits: 0\n", 202 | "X-Timer: S1684298381.769825,VS0,VE670\n", 203 | "Vary: Authorization,Accept-Encoding,Origin\n", 204 | "Access-Control-Allow-Origin: *\n", 205 | "X-Fastly-Request-ID: b2ff330ea1bb7f1da4c072d6a895f5e14951d76e\n", 206 | "Expires: Wed, 17 May 2023 04:44:41 GMT\n", 207 | "Source-Age: 0\n", 208 | "\n", 209 | "]\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "try:\n", 215 | " config = ConfigurationManager()\n", 216 | " data_ingestion_config = config.get_data_ingestion_config()\n", 217 | " data_ingestion = DataIngestion(config=data_ingestion_config)\n", 218 | " data_ingestion.download_file()\n", 219 | " data_ingestion.extract_zip_file()\n", 220 | "except Exception as e:\n", 221 | " raise e" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "textS", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.8.16" 249 | }, 250 | "orig_nbformat": 4 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 2 254 | } 255 | -------------------------------------------------------------------------------- /research/05_Model_evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | 
"source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "\n", 71 | "@dataclass(frozen=True)\n", 72 | "class ModelEvaluationConfig:\n", 73 | " root_dir: Path\n", 74 | " data_path: Path\n", 75 | " model_path: Path\n", 76 | " tokenizer_path: Path\n", 77 | " metric_file_name: Path" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from textSummarizer.constants import *\n", 87 | "from textSummarizer.utils.common import read_yaml, create_directories" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 7, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "class ConfigurationManager:\n", 97 | " def __init__(\n", 98 | " self,\n", 99 | " config_filepath = CONFIG_FILE_PATH,\n", 100 | " params_filepath = PARAMS_FILE_PATH):\n", 101 | "\n", 102 | " self.config = read_yaml(config_filepath)\n", 103 | " self.params = read_yaml(params_filepath)\n", 104 | "\n", 105 | " create_directories([self.config.artifacts_root])\n", 106 | "\n", 107 | "\n", 108 | " \n", 109 | " def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n", 110 | " config = self.config.model_evaluation\n", 111 | "\n", 112 | " create_directories([config.root_dir])\n", 113 | "\n", 114 | " model_evaluation_config = ModelEvaluationConfig(\n", 115 | " root_dir=config.root_dir,\n", 116 | " data_path=config.data_path,\n", 117 | " model_path = config.model_path,\n", 118 | " tokenizer_path = config.tokenizer_path,\n", 119 | " metric_file_name = config.metric_file_name\n", 120 | " \n", 121 | " )\n", 122 | "\n", 123 | " return model_evaluation_config\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 9, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", 133 | "from datasets import load_dataset, load_from_disk, load_metric\n", 134 | "import torch\n", 135 | "import pandas as pd\n", 136 | "from tqdm import tqdm" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 10, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "class ModelEvaluation:\n", 146 | " def __init__(self, config: ModelEvaluationConfig):\n", 147 | " self.config = config\n", 148 | "\n", 149 | "\n", 150 | " \n", 151 | " def generate_batch_sized_chunks(self,list_of_elements, batch_size):\n", 152 | " \"\"\"split the dataset into smaller batches that we can process simultaneously\n", 153 | " Yield successive batch-sized chunks from list_of_elements.\"\"\"\n", 154 | " for i in range(0, len(list_of_elements), batch_size):\n", 155 | " yield list_of_elements[i : i + batch_size]\n", 156 | "\n", 157 | " \n", 158 | " def calculate_metric_on_test_ds(self,dataset, metric, model, tokenizer, \n", 159 | " batch_size=16, device=\"cuda\" if torch.cuda.is_available() else \"cpu\", \n", 160 | " column_text=\"article\", \n", 161 | " column_summary=\"highlights\"):\n", 162 | " article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))\n", 163 | " target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))\n", 164 | "\n", 165 | " for article_batch, target_batch in tqdm(\n", 166 | " zip(article_batches, target_batches), total=len(article_batches)):\n", 167 | " \n", 168 | " inputs = tokenizer(article_batch, max_length=1024, truncation=True, \n", 169 | " padding=\"max_length\", return_tensors=\"pt\")\n", 170 | " \n", 171 | " 
summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n", 172 | " attention_mask=inputs[\"attention_mask\"].to(device), \n", 173 | " length_penalty=0.8, num_beams=8, max_length=128)\n", 174 | " ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''\n", 175 | " \n", 176 | " # Finally, we decode the generated texts, \n", 177 | " # replace the <n> token, and add the decoded texts with the references to the metric.\n", 178 | " decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, \n", 179 | " clean_up_tokenization_spaces=True) \n", 180 | " for s in summaries] \n", 181 | " \n", 182 | " decoded_summaries = [d.replace(\"<n>\", \" \") for d in decoded_summaries]\n", 183 | " \n", 184 | " \n", 185 | " metric.add_batch(predictions=decoded_summaries, references=target_batch)\n", 186 | " \n", 187 | " # Finally compute and return the ROUGE scores.\n", 188 | " score = metric.compute()\n", 189 | " return score\n", 190 | "\n", 191 | "\n", 192 | " def evaluate(self):\n", 193 | " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 194 | " tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n", 195 | " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n", 196 | " \n", 197 | " #loading data \n", 198 | " dataset_samsum_pt = load_from_disk(self.config.data_path)\n", 199 | "\n", 200 | "\n", 201 | " rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n", 202 | " \n", 203 | " rouge_metric = load_metric('rouge')\n", 204 | "\n", 205 | " score = self.calculate_metric_on_test_ds(\n", 206 | " dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'\n", 207 | " )\n", 208 | "\n", 209 | " rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )\n", 210 | "\n", 211 | " df = pd.DataFrame(rouge_dict, index = ['pegasus'] )\n", 212 | " df.to_csv(self.config.metric_file_name, index=False)\n", 213 | "\n", 214 | " \n", 215 | "\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "[2023-05-18 20:14:03,142: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 228 | "[2023-05-18 20:14:03,151: INFO: common: yaml file: params.yaml loaded successfully]\n", 229 | "[2023-05-18 20:14:03,153: INFO: common: created directory at: artifacts]\n", 230 | "[2023-05-18 20:14:03,155: INFO: common: created directory at: artifacts/model_evaluation]\n" 231 | ] 232 | }, 233 | { 234 | "name": "stderr", 235 | "output_type": "stream", 236 | "text": [ 237 | "C:\\Users\\bokti\\AppData\\Local\\Temp\\ipykernel_25280\\2973449339.py:59: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. 
Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n", 238 | " rouge_metric = load_metric('rouge')\n", 239 | "100%|██████████| 5/5 [03:54<00:00, 46.91s/it]" 240 | ] 241 | }, 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "[2023-05-18 20:18:18,394: INFO: rouge_scorer: Using default tokenizer.]\n" 247 | ] 248 | }, 249 | { 250 | "name": "stderr", 251 | "output_type": "stream", 252 | "text": [ 253 | "\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "try:\n", 259 | " config = ConfigurationManager()\n", 260 | " model_evaluation_config = config.get_model_evaluation_config()\n", 261 | " model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n", 262 | " model_evaluation_config.evaluate()\n", 263 | "except Exception as e:\n", 264 | " raise e" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [] 273 | } 274 | ], 275 | "metadata": { 276 | "kernelspec": { 277 | "display_name": "textS", 278 | "language": "python", 279 | "name": "python3" 280 | }, 281 | "language_info": { 282 | "codemirror_mode": { 283 | "name": "ipython", 284 | "version": 3 285 | }, 286 | "file_extension": ".py", 287 | "mimetype": "text/x-python", 288 | "name": "python", 289 | "nbconvert_exporter": "python", 290 | "pygments_lexer": "ipython3", 291 | "version": "3.8.16" 292 | }, 293 | "orig_nbformat": 4 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /research/04_model_trainer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project\\\\research'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%pwd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "os.chdir(\"../\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "'d:\\\\Bappy\\\\YouTube\\\\Text-Summarizer-Project'" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "%pwd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from dataclasses import dataclass\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "\n", 71 | "@dataclass(frozen=True)\n", 72 | "class ModelTrainerConfig:\n", 73 | " root_dir: Path\n", 74 | " data_path: Path\n", 75 | " model_ckpt: Path\n", 76 | " num_train_epochs: int\n", 77 | " warmup_steps: int\n", 78 | " per_device_train_batch_size: int\n", 79 | " weight_decay: float\n", 80 | " logging_steps: int\n", 81 | " evaluation_strategy: str\n", 82 | " eval_steps: int\n", 83 | " save_steps: float\n", 84 | " gradient_accumulation_steps: int" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | 
"metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "from textSummarizer.constants import *\n", 94 | "from textSummarizer.utils.common import read_yaml, create_directories" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "class ConfigurationManager:\n", 104 | " def __init__(\n", 105 | " self,\n", 106 | " config_filepath = CONFIG_FILE_PATH,\n", 107 | " params_filepath = PARAMS_FILE_PATH):\n", 108 | "\n", 109 | " self.config = read_yaml(config_filepath)\n", 110 | " self.params = read_yaml(params_filepath)\n", 111 | "\n", 112 | " create_directories([self.config.artifacts_root])\n", 113 | "\n", 114 | "\n", 115 | " \n", 116 | " def get_model_trainer_config(self) -> ModelTrainerConfig:\n", 117 | " config = self.config.model_trainer\n", 118 | " params = self.params.TrainingArguments\n", 119 | "\n", 120 | " create_directories([config.root_dir])\n", 121 | "\n", 122 | " model_trainer_config = ModelTrainerConfig(\n", 123 | " root_dir=config.root_dir,\n", 124 | " data_path=config.data_path,\n", 125 | " model_ckpt = config.model_ckpt,\n", 126 | " num_train_epochs = params.num_train_epochs,\n", 127 | " warmup_steps = params.warmup_steps,\n", 128 | " per_device_train_batch_size = params.per_device_train_batch_size,\n", 129 | " weight_decay = params.weight_decay,\n", 130 | " logging_steps = params.logging_steps,\n", 131 | " evaluation_strategy = params.evaluation_strategy,\n", 132 | " eval_steps = params.evaluation_strategy,\n", 133 | " save_steps = params.save_steps,\n", 134 | " gradient_accumulation_steps = params.gradient_accumulation_steps\n", 135 | " )\n", 136 | "\n", 137 | " return model_trainer_config" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stderr", 147 | "output_type": "stream", 148 | "text": [ 149 | "d:\\Softwares\\anaconda3\\envs\\textS\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 150 | " from .autonotebook import tqdm as notebook_tqdm\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "from transformers import TrainingArguments, Trainer\n", 156 | "from transformers import DataCollatorForSeq2Seq\n", 157 | "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", 158 | "from datasets import load_dataset, load_from_disk\n", 159 | "import torch" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 9, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "class ModelTrainer:\n", 169 | " def __init__(self, config: ModelTrainerConfig):\n", 170 | " self.config = config\n", 171 | "\n", 172 | "\n", 173 | " \n", 174 | " def train(self):\n", 175 | " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 176 | " tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)\n", 177 | " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)\n", 178 | " seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)\n", 179 | " \n", 180 | " #loading data \n", 181 | " dataset_samsum_pt = load_from_disk(self.config.data_path)\n", 182 | "\n", 183 | " # trainer_args = TrainingArguments(\n", 184 | " # output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,\n", 185 | " # per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,\n", 186 | " # weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,\n", 187 | " # evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,\n", 188 | " # gradient_accumulation_steps=self.config.gradient_accumulation_steps\n", 189 | " # ) \n", 190 | "\n", 191 | "\n", 192 | " trainer_args = TrainingArguments(\n", 193 | " output_dir=self.config.root_dir, num_train_epochs=1, warmup_steps=500,\n", 194 | " per_device_train_batch_size=1, per_device_eval_batch_size=1,\n", 195 | " weight_decay=0.01, logging_steps=10,\n", 196 | " evaluation_strategy='steps', eval_steps=500, save_steps=1e6,\n", 197 | " gradient_accumulation_steps=16\n", 198 | " ) \n", 199 | "\n", 200 | " trainer = Trainer(model=model_pegasus, args=trainer_args,\n", 201 | " tokenizer=tokenizer, data_collator=seq2seq_data_collator,\n", 202 | " train_dataset=dataset_samsum_pt[\"train\"], \n", 203 | " eval_dataset=dataset_samsum_pt[\"validation\"])\n", 204 | " \n", 205 | " trainer.train()\n", 206 | "\n", 207 | " ## Save model\n", 208 | " model_pegasus.save_pretrained(os.path.join(self.config.root_dir,\"pegasus-samsum-model\"))\n", 209 | " ## Save tokenizer\n", 210 | " tokenizer.save_pretrained(os.path.join(self.config.root_dir,\"tokenizer\"))\n" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 10, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "[2023-05-18 12:54:11,649: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", 223 | "[2023-05-18 12:54:11,652: INFO: common: yaml file: params.yaml loaded successfully]\n", 224 | "[2023-05-18 12:54:11,654: INFO: common: created directory at: artifacts]\n", 225 | "[2023-05-18 12:54:11,655: INFO: common: created directory at: artifacts/model_trainer]\n" 226 | ] 227 | }, 228 | { 229 | "name": "stderr", 230 | "output_type": "stream", 231 | "text": [ 232 | 
"d:\\Softwares\\anaconda3\\envs\\textS\\lib\\site-packages\\transformers\\optimization.py:407: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", 233 | " warnings.warn(\n", 234 | " 0%| | 0/51 [00:00