├── README.md
├── params.yaml
└── src
    ├── data_ingestion.py
    ├── data_preprocessing.py
    ├── feature_engineering.py
    ├── model_building.py
    └── model_evaluation.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ml-pipelines-using-dvc
Code showing how to build an ML pipeline using DVC
--------------------------------------------------------------------------------
/params.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/campusx-official/ml-pipelines-using-dvc/2c1c60c7b05a92cf2ec743f36b84d69c01d34cd0/params.yaml
--------------------------------------------------------------------------------
/src/data_ingestion.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import yaml

import logging

# logging configuration
logger = logging.getLogger('data_ingestion')
logger.setLevel('DEBUG')

console_handler = logging.StreamHandler()
console_handler.setLevel('DEBUG')

file_handler = logging.FileHandler('errors.log')
file_handler.setLevel('ERROR')

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

logger.addHandler(console_handler)
logger.addHandler(file_handler)

def load_params(params_path: str) -> float:
    try:
        with open(params_path, 'r') as file:
            params = yaml.safe_load(file)
        test_size = params['data_ingestion']['test_size']
        logger.debug('test size retrieved')
        return test_size
    except FileNotFoundError:
        logger.error('File not found')
        raise
    except yaml.YAMLError as e:
        logger.error('YAML error')
        raise
    except Exception as e:
        logger.error('some error occurred')
        raise

def load_data(data_url: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(data_url)
        return df
    except pd.errors.ParserError as e:
        print(f"Error: Failed to parse the CSV file from {data_url}.")
        print(e)
        raise
    except Exception as e:
        print("Error: An unexpected error occurred while loading the data.")
        print(e)
        raise

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    try:
        df.drop(columns=['tweet_id'], inplace=True)
        # keep only the two classes used for binary classification
        final_df = df[df['sentiment'].isin(['happiness', 'sadness'])].copy()
        final_df['sentiment'] = final_df['sentiment'].replace({'happiness': 1, 'sadness': 0})
        return final_df
    except KeyError as e:
        print(f"Error: Missing column {e} in the dataframe.")
        raise
    except Exception as e:
        print("Error: An unexpected error occurred during preprocessing.")
        print(e)
        raise

def save_data(train_data: pd.DataFrame, test_data: pd.DataFrame, data_path: str) -> None:
    try:
        data_path = os.path.join(data_path, 'raw')
        os.makedirs(data_path, exist_ok=True)
        train_data.to_csv(os.path.join(data_path, "train.csv"), index=False)
        test_data.to_csv(os.path.join(data_path, "test.csv"), index=False)
    except Exception as e:
        print("Error: An unexpected error occurred while saving the data.")
        print(e)
        raise

def main():
    try:
        test_size = load_params(params_path='params.yaml')
        df = load_data(data_url='https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv')
        final_df = preprocess_data(df)
        train_data, test_data = train_test_split(final_df, test_size=test_size, random_state=42)
        save_data(train_data, test_data, data_path='data')
    except Exception as e:
        print(f"Error: {e}")
        print("Failed to complete the data ingestion process.")

if __name__ == '__main__':
    main()
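Note: the contents of params.yaml are only referenced by URL in this dump. Based on load_params above, it must contain at least a data_ingestion block with a test_size key. A minimal assumed sketch (the 0.2 value is illustrative, not taken from the repo):

data_ingestion:
  test_size: 0.2   # illustrative value, not the repo's actual setting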
--------------------------------------------------------------------------------
/src/data_preprocessing.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

import os

import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# fetch the data from data/raw
train_data = pd.read_csv('./data/raw/train.csv')
test_data = pd.read_csv('./data/raw/test.csv')

# transform the data
nltk.download('wordnet')
nltk.download('stopwords')

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(y) for y in text]
    return " ".join(text)

def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    text = [i for i in str(text).split() if i not in stop_words]
    return " ".join(text)

def removing_numbers(text):
    text = ''.join([i for i in text if not i.isdigit()])
    return text

def lower_case(text):
    text = text.split()
    text = [y.lower() for y in text]
    return " ".join(text)

def removing_punctuations(text):
    # remove punctuation
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\\]^_`{|}~"""), ' ', text)
    text = text.replace('؛', "")

    # remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    text = " ".join(text.split())
    return text.strip()

def removing_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_small_sentences(df):
    # drop rows whose content has fewer than 3 words (note: currently unused)
    for i in range(len(df)):
        if len(df.content.iloc[i].split()) < 3:
            df.content.iloc[i] = np.nan

def normalize_text(df):
    df.content = df.content.apply(lambda content: lower_case(content))
    # remove URLs first, so the digit and punctuation stripping below cannot break the URL pattern
    df.content = df.content.apply(lambda content: removing_urls(content))
    df.content = df.content.apply(lambda content: remove_stop_words(content))
    df.content = df.content.apply(lambda content: removing_numbers(content))
    df.content = df.content.apply(lambda content: removing_punctuations(content))
    df.content = df.content.apply(lambda content: lemmatization(content))
    return df

train_processed_data = normalize_text(train_data)
test_processed_data = normalize_text(test_data)

# store the data inside data/processed
data_path = os.path.join("data", "processed")
os.makedirs(data_path, exist_ok=True)

train_processed_data.to_csv(os.path.join(data_path, "train_processed.csv"), index=False)
test_processed_data.to_csv(os.path.join(data_path, "test_processed.csv"), index=False)
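A quick, illustrative sanity check of the cleaning chain above (the sample tweet is made up, and the functions are assumed to be in scope, e.g. after moving them into a module that does not run the file I/O at import time):

import pandas as pd

# hypothetical sample, not taken from the dataset
sample = pd.DataFrame({"content": ["Loving the weather today!!! 100% sunshine https://example.com"]})
print(normalize_text(sample)["content"].iloc[0])
# expected shape of the output: lower-cased, with URLs, stop words, digits and punctuation
# stripped and words lemmatized, e.g. roughly "loving weather today sunshine"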
--------------------------------------------------------------------------------
/src/feature_engineering.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

import os

from sklearn.feature_extraction.text import CountVectorizer

# fetch the data from data/processed
train_data = pd.read_csv('./data/processed/train_processed.csv')
test_data = pd.read_csv('./data/processed/test_processed.csv')

train_data.fillna('', inplace=True)
test_data.fillna('', inplace=True)

# apply Bag of Words (CountVectorizer)
X_train = train_data['content'].values
y_train = train_data['sentiment'].values

X_test = test_data['content'].values
y_test = test_data['sentiment'].values

vectorizer = CountVectorizer(max_features=50)

# fit the vectorizer on the training data and transform it
X_train_bow = vectorizer.fit_transform(X_train)

# transform the test data using the same vectorizer
X_test_bow = vectorizer.transform(X_test)

train_df = pd.DataFrame(X_train_bow.toarray())
train_df['label'] = y_train

test_df = pd.DataFrame(X_test_bow.toarray())
test_df['label'] = y_test

# store the data inside data/features
data_path = os.path.join("data", "features")
os.makedirs(data_path, exist_ok=True)

# index=False keeps the index out of the CSV, so the positional slicing used downstream
# (iloc[:, 0:-1] / iloc[:, -1]) sees only the 50 BoW columns plus the label
train_df.to_csv(os.path.join(data_path, "train_bow.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_bow.csv"), index=False)
--------------------------------------------------------------------------------
/src/model_building.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import GradientBoostingClassifier

# fetch the features from data/features
train_data = pd.read_csv('./data/features/train_bow.csv')

X_train = train_data.iloc[:, 0:-1].values
y_train = train_data.iloc[:, -1].values

# define and train the gradient boosting model
clf = GradientBoostingClassifier(n_estimators=50)
clf.fit(X_train, y_train)

# save the trained model
with open('model.pkl', 'wb') as f:
    pickle.dump(clf, f)
--------------------------------------------------------------------------------
/src/model_evaluation.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

import pickle
import json

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

with open('model.pkl', 'rb') as f:
    clf = pickle.load(f)
test_data = pd.read_csv('./data/features/test_bow.csv')

X_test = test_data.iloc[:, 0:-1].values
y_test = test_data.iloc[:, -1].values

y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

metrics_dict = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'auc': auc
}

with open('metrics.json', 'w') as file:
    json.dump(metrics_dict, file, indent=4)
--------------------------------------------------------------------------------
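The dump contains the stage scripts but not the dvc.yaml that chains them into a DVC pipeline. A minimal sketch of how these five scripts could be wired together (assumed, not taken from the repo; stage names, deps and outs are inferred from the paths each script reads and writes):

stages:
  data_ingestion:
    cmd: python src/data_ingestion.py
    deps:
      - src/data_ingestion.py
    params:
      - data_ingestion.test_size
    outs:
      - data/raw
  data_preprocessing:
    cmd: python src/data_preprocessing.py
    deps:
      - src/data_preprocessing.py
      - data/raw
    outs:
      - data/processed
  feature_engineering:
    cmd: python src/feature_engineering.py
    deps:
      - src/feature_engineering.py
      - data/processed
    outs:
      - data/features
  model_building:
    cmd: python src/model_building.py
    deps:
      - src/model_building.py
      - data/features
    outs:
      - model.pkl
  model_evaluation:
    cmd: python src/model_evaluation.py
    deps:
      - src/model_evaluation.py
      - model.pkl
      - data/features
    metrics:
      - metrics.json:
          cache: false

With a file like this in place, `dvc repro` re-runs only the stages whose dependencies or parameters changed, and `dvc metrics show` reports the values written to metrics.json.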