├── README.md
├── params.yaml
└── src
    ├── data_ingestion.py
    ├── data_preprocessing.py
    ├── feature_engineering.py
    ├── model_building.py
    └── model_evaluation.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ml-pipelines-using-dvc
Code showing how to build an ML pipeline using DVC
--------------------------------------------------------------------------------
/params.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/campusx-official/ml-pipelines-using-dvc/2c1c60c7b05a92cf2ec743f36b84d69c01d34cd0/params.yaml
--------------------------------------------------------------------------------
/src/data_ingestion.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import yaml

import logging

# logging configuration
logger = logging.getLogger('data_ingestion')
logger.setLevel('DEBUG')

console_handler = logging.StreamHandler()
console_handler.setLevel('DEBUG')

file_handler = logging.FileHandler('errors.log')
file_handler.setLevel('ERROR')

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

logger.addHandler(console_handler)
logger.addHandler(file_handler)

def load_params(params_path: str) -> float:
    try:
        with open(params_path, 'r') as file:
            params = yaml.safe_load(file)
        test_size = params['data_ingestion']['test_size']
        logger.debug('test size retrieved')
        return test_size
    except FileNotFoundError:
        logger.error('File not found')
        raise
    except yaml.YAMLError as e:
        logger.error('YAML error')
        raise
    except Exception as e:
        logger.error('some error occurred')
        raise

def load_data(data_url: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(data_url)
        return df
    except pd.errors.ParserError as e:
        print(f"Error: Failed to parse the CSV file from {data_url}.")
        print(e)
        raise
    except Exception as e:
        print("Error: An unexpected error occurred while loading the data.")
        print(e)
        raise

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    try:
        df.drop(columns=['tweet_id'], inplace=True)
        # keep only the two classes used for binary classification
        final_df = df[df['sentiment'].isin(['happiness', 'sadness'])].copy()
        final_df['sentiment'] = final_df['sentiment'].replace({'happiness': 1, 'sadness': 0})
        return final_df
    except KeyError as e:
        print(f"Error: Missing column {e} in the dataframe.")
        raise
    except Exception as e:
        print("Error: An unexpected error occurred during preprocessing.")
        print(e)
        raise

def save_data(train_data: pd.DataFrame, test_data: pd.DataFrame, data_path: str) -> None:
    try:
        data_path = os.path.join(data_path, 'raw')
        os.makedirs(data_path, exist_ok=True)
        train_data.to_csv(os.path.join(data_path, "train.csv"), index=False)
        test_data.to_csv(os.path.join(data_path, "test.csv"), index=False)
    except Exception as e:
        print("Error: An unexpected error occurred while saving the data.")
        print(e)
        raise

def main():
    try:
        test_size = load_params(params_path='params.yaml')
        df = load_data(data_url='https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv')
        final_df = preprocess_data(df)
        train_data, test_data = train_test_split(final_df, test_size=test_size, random_state=42)
        save_data(train_data, test_data, data_path='data')
    except Exception as e:
        print(f"Error: {e}")
        print("Failed to complete the data ingestion process.")

if __name__ == '__main__':
    main()
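Note: the contents of params.yaml are only referenced by URL in this dump. Based on load_params above, it must contain at least a data_ingestion block with a test_size key. A minimal assumed sketch (the 0.2 value is illustrative, not taken from the repo):

data_ingestion:
  test_size: 0.2   # illustrative value, not the repo's actual setting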
--------------------------------------------------------------------------------
/src/data_preprocessing.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

import os

import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# fetch the data from data/raw
train_data = pd.read_csv('./data/raw/train.csv')
test_data = pd.read_csv('./data/raw/test.csv')

# transform the data
nltk.download('wordnet')
nltk.download('stopwords')

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(y) for y in text]
    return " ".join(text)

def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    text = [i for i in str(text).split() if i not in stop_words]
    return " ".join(text)

def removing_numbers(text):
    text = ''.join([i for i in text if not i.isdigit()])
    return text

def lower_case(text):
    text = text.split()
    text = [y.lower() for y in text]
    return " ".join(text)

def removing_punctuations(text):
    # remove punctuation
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\\]^_`{|}~"""), ' ', text)
    text = text.replace('؛', "")

    # remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    text = " ".join(text.split())
    return text.strip()

def removing_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_small_sentences(df):
    # drop rows whose content has fewer than 3 words (note: currently unused)
    for i in range(len(df)):
        if len(df.content.iloc[i].split()) < 3:
            df.content.iloc[i] = np.nan

def normalize_text(df):
    df.content = df.content.apply(lambda content: lower_case(content))
    # remove URLs first, so the digit and punctuation stripping below cannot break the URL pattern
    df.content = df.content.apply(lambda content: removing_urls(content))
    df.content = df.content.apply(lambda content: remove_stop_words(content))
    df.content = df.content.apply(lambda content: removing_numbers(content))
    df.content = df.content.apply(lambda content: removing_punctuations(content))
    df.content = df.content.apply(lambda content: lemmatization(content))
    return df

train_processed_data = normalize_text(train_data)
test_processed_data = normalize_text(test_data)

# store the data inside data/processed
data_path = os.path.join("data", "processed")
os.makedirs(data_path, exist_ok=True)

train_processed_data.to_csv(os.path.join(data_path, "train_processed.csv"), index=False)
test_processed_data.to_csv(os.path.join(data_path, "test_processed.csv"), index=False)
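A quick, illustrative sanity check of the cleaning chain above (the sample tweet is made up, and the functions are assumed to be in scope, e.g. after moving them into a module that does not run the file I/O at import time):

import pandas as pd

# hypothetical sample, not taken from the dataset
sample = pd.DataFrame({"content": ["Loving the weather today!!! 100% sunshine https://example.com"]})
print(normalize_text(sample)["content"].iloc[0])
# expected shape of the output: lower-cased, with URLs, stop words, digits and punctuation
# stripped and words lemmatized, e.g. roughly "loving weather today sunshine"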
--------------------------------------------------------------------------------
/src/feature_engineering.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

import os

from sklearn.feature_extraction.text import CountVectorizer

# fetch the data from data/processed
train_data = pd.read_csv('./data/processed/train_processed.csv')
test_data = pd.read_csv('./data/processed/test_processed.csv')

train_data.fillna('', inplace=True)
test_data.fillna('', inplace=True)

# apply Bag of Words (CountVectorizer)
X_train = train_data['content'].values
y_train = train_data['sentiment'].values

X_test = test_data['content'].values
y_test = test_data['sentiment'].values

vectorizer = CountVectorizer(max_features=50)

# fit the vectorizer on the training data and transform it
X_train_bow = vectorizer.fit_transform(X_train)

# transform the test data using the same vectorizer
X_test_bow = vectorizer.transform(X_test)

train_df = pd.DataFrame(X_train_bow.toarray())
train_df['label'] = y_train

test_df = pd.DataFrame(X_test_bow.toarray())
test_df['label'] = y_test

# store the data inside data/features
data_path = os.path.join("data", "features")
os.makedirs(data_path, exist_ok=True)

# index=False keeps the index out of the CSV, so the positional slicing used downstream
# (iloc[:, 0:-1] / iloc[:, -1]) sees only the 50 BoW columns plus the label
train_df.to_csv(os.path.join(data_path, "train_bow.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_bow.csv"), index=False)
--------------------------------------------------------------------------------
/src/model_building.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import GradientBoostingClassifier

# fetch the features from data/features
train_data = pd.read_csv('./data/features/train_bow.csv')

X_train = train_data.iloc[:, 0:-1].values
y_train = train_data.iloc[:, -1].values

# define and train the gradient boosting model
clf = GradientBoostingClassifier(n_estimators=50)
clf.fit(X_train, y_train)

# save the trained model
with open('model.pkl', 'wb') as f:
    pickle.dump(clf, f)
--------------------------------------------------------------------------------
/src/model_evaluation.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

import pickle
import json

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

with open('model.pkl', 'rb') as f:
    clf = pickle.load(f)
test_data = pd.read_csv('./data/features/test_bow.csv')

X_test = test_data.iloc[:, 0:-1].values
y_test = test_data.iloc[:, -1].values

y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

metrics_dict = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'auc': auc
}

with open('metrics.json', 'w') as file:
    json.dump(metrics_dict, file, indent=4)
--------------------------------------------------------------------------------
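The dump contains the stage scripts but not the dvc.yaml that chains them into a DVC pipeline. A minimal sketch of how these five scripts could be wired together (assumed, not taken from the repo; stage names, deps and outs are inferred from the paths each script reads and writes):

stages:
  data_ingestion:
    cmd: python src/data_ingestion.py
    deps:
      - src/data_ingestion.py
    params:
      - data_ingestion.test_size
    outs:
      - data/raw
  data_preprocessing:
    cmd: python src/data_preprocessing.py
    deps:
      - src/data_preprocessing.py
      - data/raw
    outs:
      - data/processed
  feature_engineering:
    cmd: python src/feature_engineering.py
    deps:
      - src/feature_engineering.py
      - data/processed
    outs:
      - data/features
  model_building:
    cmd: python src/model_building.py
    deps:
      - src/model_building.py
      - data/features
    outs:
      - model.pkl
  model_evaluation:
    cmd: python src/model_evaluation.py
    deps:
      - src/model_evaluation.py
      - model.pkl
      - data/features
    metrics:
      - metrics.json:
          cache: false

With a file like this in place, `dvc repro` re-runs only the stages whose dependencies or parameters changed, and `dvc metrics show` reports the values written to metrics.json.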