├── assets ├── eval │ └── .gitignore ├── .gitignore └── original_data.dvc ├── params.yaml ├── src ├── __init__.py ├── config.py ├── pipeline.py ├── preprocess.py ├── utils.py ├── featurize.py └── model.py ├── .gitignore ├── app ├── __init__.py └── app.py ├── python-setup.py ├── .dvc ├── .gitignore └── config ├── deploy.sh ├── .dockeringore ├── Dockerfile ├── dvc.yaml ├── kubernetes.yaml.tpl ├── dvc.lock ├── demo_steps.md ├── cloudbuild.yaml ├── requirements.txt ├── README.md └── LICENSE /assets/eval/.gitignore: -------------------------------------------------------------------------------- 1 | /scores.json 2 | -------------------------------------------------------------------------------- /params.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | split: 0.30 3 | random: 20 -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | 3 | sys.path.insert(0,'src/') -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dvc-remote 2 | venv 3 | *.pyc 4 | __pycache__ 5 | .DS_Store 6 | .vscode -------------------------------------------------------------------------------- /assets/.gitignore: -------------------------------------------------------------------------------- 1 | /original_data 2 | /preprocessed 3 | /featurized 4 | /models 5 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | 3 | sys.path.insert(0,'src/') 4 | from app.app import app -------------------------------------------------------------------------------- /python-setup.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | nltk.download('stopwords') 4 | nltk.download('punkt') -------------------------------------------------------------------------------- /assets/original_data.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: ff02c28948dc2b8c2ce42d6fcae7c385.dir 3 | path: original_data 4 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /updater 3 | /lock 4 | /updater.lock 5 | /tmp 6 | /state-journal 7 | /state-wal 8 | /state 9 | /cache 10 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | export PYTHONPATH=. 3 | python python-setup.py 4 | dvc pull -r gcs 5 | dvc repro 6 | gunicorn app:app -w 6 --threads 10 -b 0.0.0.0:80 -------------------------------------------------------------------------------- /.dockeringore: -------------------------------------------------------------------------------- 1 | dvc-remote 2 | venv 3 | *.pyc 4 | __pycache__ 5 | .DS_Store 6 | .vscode 7 | assets/original_data 8 | assets/preprocessed 9 | assets/featurized 10 | assets/models -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- 1 | [core] 2 | remote = origin 3 | ['remote "origin"'] 4 | url = ../dvc-remote 5 | ['remote "gdrive"'] 6 | url = gdrive://1tsTpwfWTSMWjRDrGqkCC9aDHXA_NY89O 7 | ['remote "gcs"'] 8 | url = gs://mlops-dvc-storage 9 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | BASE = "assets" 3 | ORIGINAL_DATASET_PATH = f"{BASE}/original_data" 4 | TRAIN_DATASET = f"{ORIGINAL_DATASET_PATH}/train.csv" 5 | TEST_DATASET = f"{ORIGINAL_DATASET_PATH}/test.csv" 6 | PREPROCESS_PATH = f"{BASE}/preprocessed" 7 | FEATURES_PATH = f"{BASE}/featurized" 8 | MODELS_PATH = f"{BASE}/models" 9 | EVAL_PATH = f"{BASE}/eval" 10 | 11 | PREPROCESSED_TRAIN = f"{PREPROCESS_PATH}/train.csv" 12 | PREPROCESSED_TEST = f"{PREPROCESS_PATH}/test.csv" -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | RUN apt install python3.7-distutils 4 | 5 | # set working directory 6 | WORKDIR /project 7 | 8 | # set environment variables 9 | ENV PYTHONDONTWRITEBYTECODE 1 10 | ENV PYTHONUNBUFFERED 1 11 | 12 | # add requirements 13 | RUN git clone https://github.com/bhavaniravi/mlops-with-dvc.git . 14 | 15 | # install requirements 16 | RUN pip install -r requirements.txt 17 | 18 | # run server 19 | #CMD python manage.py runserver 20 | CMD bash ./deploy.sh 21 | 22 | # expose 23 | EXPOSE 5000 24 | -------------------------------------------------------------------------------- /src/pipeline.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.path.insert(0,f'{os.getcwd()}/src') 3 | 4 | from preprocess import preprocess_df 5 | from featurize import featurize_df 6 | from model import infer_df 7 | from config import Config 8 | import pandas as pd 9 | import pickle 10 | 11 | 12 | def predict(text): 13 | df = pd.DataFrame([{"id": 1, "tweet":text}]) 14 | df = preprocess_df(df) 15 | features = featurize_df(df) 16 | model = pickle.load(open(f"{Config.MODELS_PATH}/model.pickle", "rb")) 17 | result_df = infer_df(model, features) 18 | return result_df 19 | -------------------------------------------------------------------------------- /dvc.yaml: -------------------------------------------------------------------------------- 1 | stages: 2 | preprocess: 3 | cmd: python src/preprocess.py 4 | deps: 5 | - assets/original_data/train.csv 6 | - src/preprocess.py 7 | outs: 8 | - assets/preprocessed/ 9 | featurize: 10 | cmd: python src/featurize.py 11 | deps: 12 | - assets/preprocessed/test.csv 13 | - assets/preprocessed/train.csv 14 | - src/preprocess.py 15 | outs: 16 | - assets/featurized/ 17 | train_test_eval: 18 | cmd: python src/model.py 19 | deps: 20 | - assets/featurized 21 | - src/model.py 22 | outs: 23 | - assets/models 24 | metrics: 25 | - assets/eval/scores.json: 26 | cache: true 27 | params: 28 | - model.random 29 | - model.split 30 | -------------------------------------------------------------------------------- /app/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | from flask import Blueprint, request, render_template, jsonify 3 | 4 | import sys, os 5 | sys.path.insert(0,f'{os.getcwd()}/src') 6 | import pipeline 7 | 8 | 9 | 10 | app = Flask(__name__) 11 | 12 | 13 | 14 | @app.route('/predict', methods=["POST", "GET"]) 15 | def predict(): 16 | if not request.args: 17 | return {"status": "Failure", "message": "Nothing to predict"} 18 | text = request.args.get('tweet') 19 | prediction = pipeline.predict(text) 20 | abusive_scale = prediction["probability"][0] 21 | print (abusive_scale) 22 | is_abusive = 0 23 | if abusive_scale > 0.015: 24 | is_abusive = 1 25 | 26 | return {"status": "success", "is_abusive": str(is_abusive)} 27 | 28 | 29 | 30 | 31 | if __name__ == "__main__": 32 | app.run(host='0.0.0.0', debug=True) -------------------------------------------------------------------------------- /kubernetes.yaml.tpl: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: "v1" 3 | kind: "Namespace" 4 | metadata: 5 | name: APP_NAME 6 | --- 7 | apiVersion: "apps/v1" 8 | kind: "Deployment" 9 | metadata: 10 | name: APP_NAME 11 | namespace: APP_NAME 12 | labels: 13 | app: APP_NAME 14 | spec: 15 | replicas: 1 16 | selector: 17 | matchLabels: 18 | app: APP_NAME 19 | template: 20 | metadata: 21 | labels: 22 | app: APP_NAME 23 | spec: 24 | containers: 25 | - name: "mlops-image-1" 26 | image: "gcr.io/GOOGLE_CLOUD_PROJECT/APP_NAME:COMMIT_SHA" 27 | --- 28 | kind: Service 29 | apiVersion: v1 30 | metadata: 31 | namespace: "mlops" 32 | name: APP_NAME 33 | spec: 34 | selector: 35 | app: APP_NAME 36 | ports: 37 | - protocol: TCP 38 | port: 80 39 | targetPort: 80 40 | name: APP_NAME 41 | type: LoadBalancer 42 | 43 | -------------------------------------------------------------------------------- /src/preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from config import Config 3 | import utils 4 | 5 | def preprocess_df(df): 6 | df['lengths'] = df['tweet'].apply(len) 7 | df['tweet'] = df['tweet'].apply(utils.encode) 8 | df["tweet"] = df["tweet"].apply(utils.remove_mention) 9 | df['tweet'] = df['tweet'].apply(utils.remove_puncuation) 10 | return df 11 | 12 | def preprocess(force=False): 13 | try: 14 | if force: 15 | raise FileNotFoundError 16 | train_df = pd.read_csv(Config.PREPROCESSED_TRAIN) 17 | test_df = pd.read_csv(Config.PREPROCESSED_TEST) 18 | except FileNotFoundError: 19 | train_df = pd.read_csv(Config.TRAIN_DATASET) 20 | test_df = pd.read_csv(Config.TEST_DATASET) 21 | 22 | train_df = preprocess_df(train_df) 23 | test_df = preprocess_df(test_df) 24 | 25 | import os 26 | os.mkdir(Config.PREPROCESS_PATH) 27 | 28 | train_df.to_csv(Config.PREPROCESSED_TRAIN) 29 | test_df.to_csv(Config.PREPROCESSED_TEST) 30 | 31 | if __name__ == '__main__': 32 | preprocess(force=False) 33 | 34 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import string 2 | import nltk 3 | from nltk.corpus import stopwords 4 | from nltk.tokenize import word_tokenize 5 | import re 6 | 7 | stopwords = stopwords.words('english') 8 | 9 | def remove_mention(text): 10 | processed = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text) 11 | text = " ".join(processed.split()) 12 | return text 13 | 14 | 15 | def remove_puncuation(text): 16 | punc_removed = [] 17 | for char in text: 18 | if char not in string.punctuation: 19 | punc_removed.append(char) 20 | 21 | return ''.join(punc_removed) 22 | 23 | 24 | def get_hashtags(text): 25 | hashtags = [] 26 | for word in text.split(" "): 27 | if word.startswith("#") and len(word[1:]) > 3: 28 | hashtags.append(word[1:]) 29 | return hashtags 30 | 31 | 32 | 33 | def remove_stop_words(text): 34 | text_tokens = word_tokenize(text) 35 | tokens_without_sw = [word for word in text_tokens if word not in stopwords and len(word) > 2] 36 | return ' '.join(tokens_without_sw) 37 | 38 | 39 | def encode(text): 40 | return text.encode('ascii',errors='ignore').decode() 41 | -------------------------------------------------------------------------------- /dvc.lock: -------------------------------------------------------------------------------- 1 | preprocess: 2 | cmd: python src/preprocess.py 3 | deps: 4 | - path: assets/original_data/train.csv 5 | md5: c3396dad17e1311181ff4c1d47ce8179 6 | - path: src/preprocess.py 7 | md5: a494ed66a55c1bde40999823eac1055f 8 | outs: 9 | - path: assets/preprocessed/ 10 | md5: bf42369d85ed69a898a80674d8c3a9b9.dir 11 | featurize: 12 | cmd: python src/featurize.py 13 | deps: 14 | - path: assets/preprocessed/test.csv 15 | md5: 8c8c92530cb5a0727af93808e962d18e 16 | - path: assets/preprocessed/train.csv 17 | md5: 7e2928f367e8560ab4b95d032d548bb2 18 | - path: src/preprocess.py 19 | md5: a494ed66a55c1bde40999823eac1055f 20 | outs: 21 | - path: assets/featurized/ 22 | md5: c8c126f368c57d909217a4437458a301.dir 23 | train_test_eval: 24 | cmd: python src/model.py 25 | deps: 26 | - path: assets/featurized 27 | md5: 2b97c3dfb7abb2df2e43d499d8aaef5d.dir 28 | - path: src/model.py 29 | md5: 31fb8697413ea4146905adc6defa5788 30 | params: 31 | params.yaml: 32 | model.random: 20 33 | model.split: 0.3 34 | outs: 35 | - path: assets/eval/scores.json 36 | md5: 616aa98dcda4fd75c9c2f15a8ffc5fb3 37 | - path: assets/models 38 | md5: ca9a44f96c1e6df0d0567744999f5242.dir 39 | -------------------------------------------------------------------------------- /demo_steps.md: -------------------------------------------------------------------------------- 1 | # MLOps 2 | 3 | ## Data and model versioning 4 | 5 | ### Define a project structure 6 | 7 | ``` 8 | ├── app 9 | │ └── app.py 10 | ├── assets 11 | │ ├── eval 12 | │ ├── featurized 13 | │ ├── models 14 | │ ├── original_data 15 | │ │ ├── test.csv 16 | │ │ └── train.csv 17 | │ └── preprocessed 18 | └── src 19 | ├── config.py 20 | ├── preprocess.py 21 | ├── featurize.py 22 | └── utils.py 23 | ``` 24 | 25 | ### Initializing the project 26 | 27 | ``` 28 | git init 29 | dvc init 30 | ``` 31 | 32 | ### Add original Data 33 | 34 | ``` 35 | dvc add assets/original_data 36 | ``` 37 | 38 | ### Create pipeline scripts 39 | 40 | > Edit preprocessing.py 41 | 42 | ### Create pipeline stage 43 | 44 | ``` 45 | dvc run -n preprocess -d assets/original_data -o assets/preprocessed python src/preprocess.py 46 | ``` 47 | 48 | ### Set remote 49 | 50 | ``` 51 | dvc remote add -d local ./dvc-remote 52 | ``` 53 | 54 | ### Push changes 55 | 56 | ``` 57 | dvc push -r local 58 | ``` 59 | 60 | ## Change data and reproduce steps 61 | 62 | ``` 63 | dvc repro 64 | ``` 65 | 66 | 67 | 68 | ## CI/CD Pipeline 69 | 70 | 1. Setup GCS remote folder 71 | 72 | ``` 73 | dvc remote add gcs gs://mlops-dvc-storage 74 | dvc push -r gcs 75 | ``` 76 | 77 | 2. Setup DockerFile 78 | 79 | 3. Setup kubernetes yaml 80 | 81 | 4. Setup CI/CD script 82 | 83 | > Commit and push code -------------------------------------------------------------------------------- /cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | # This step generates the new manifest 3 | - name: 'gcr.io/cloud-builders/gcloud' 4 | id: Generate manifest 5 | entrypoint: /bin/sh 6 | args: 7 | - '-c' 8 | - | 9 | sed "s/GOOGLE_CLOUD_PROJECT/${PROJECT_ID}/g" kubernetes.yaml.tpl | \ 10 | sed "s/APP_NAME/mlops/g" | \ 11 | sed "s/APP_IP_ADDRESS/$(gcloud compute addresses describe mlops --region us-central1 | grep 'address:'|sed 's|^address: ||')/g" | \ 12 | sed "s/COMMIT_SHA/${SHORT_SHA}/g" > kubernetes.yaml 13 | 14 | - name: 'gcr.io/cloud-builders/gcloud' 15 | id: Print Kubernetes config 16 | entrypoint: /bin/sh 17 | args: 18 | - '-c' 19 | - | 20 | cat kubernetes.yaml 21 | 22 | # This step builds the container image. 23 | - name: 'gcr.io/cloud-builders/docker' 24 | id: Build 25 | args: 26 | - 'build' 27 | - '-t' 28 | - 'gcr.io/$PROJECT_ID/mlops:$SHORT_SHA' 29 | - '.' 30 | 31 | # This step pushes the image to Container Registry 32 | # The PROJECT_ID and SHORT_SHA variables are automatically 33 | # replaced by Cloud Build. 34 | - name: 'gcr.io/cloud-builders/docker' 35 | id: Push 36 | args: 37 | - 'push' 38 | - 'gcr.io/$PROJECT_ID/mlops:$SHORT_SHA' 39 | 40 | # This step deploys the new version of our container image 41 | # in the hello-cloudbuild Kubernetes Engine cluster. 42 | - name: 'gcr.io/cloud-builders/kubectl' 43 | id: Deploy 44 | args: 45 | - 'apply' 46 | - '-f' 47 | - 'kubernetes.yaml' 48 | env: 49 | - 'CLOUDSDK_COMPUTE_ZONE=us-central1-a' 50 | - 'CLOUDSDK_CONTAINER_CLUSTER=mlops-cluster' -------------------------------------------------------------------------------- /src/featurize.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from config import Config 3 | from src import utils 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from numpy import savetxt 6 | import pickle, os 7 | 8 | def featurize_df(df): 9 | vectorizer_analyzer = pickle.load(open(f"{Config.FEATURES_PATH}/vectorizer.pickle", "rb")) 10 | return vectorizer_analyzer.transform(df['tweet']) 11 | 12 | def featurize(dataset, vectorizer_analyzer): 13 | fit_and_transform = dataset == "train" 14 | preped_df = pd.read_csv(f"{Config.PREPROCESS_PATH}/{dataset}.csv") 15 | preped_df["tweet"] = preped_df["tweet"].fillna("") 16 | if fit_and_transform: 17 | countvectorizer_tweets = vectorizer_analyzer.fit_transform(preped_df['tweet']) 18 | pickle.dump(vectorizer_analyzer, open(f"{Config.FEATURES_PATH}/vectorizer.pickle", "wb")) 19 | else: 20 | countvectorizer_tweets = vectorizer_analyzer.transform(preped_df['tweet']) 21 | return countvectorizer_tweets 22 | 23 | 24 | def create_featurizer(dataset, force=False): 25 | if not os.path.isdir(Config.FEATURES_PATH): 26 | os.mkdir(Config.FEATURES_PATH) 27 | try: 28 | if force: 29 | raise FileNotFoundError("Forcing creation") 30 | vectorizer_analyzer = pickle.load(open(f"{Config.FEATURES_PATH}/vectorizer.pickle", "rb")) 31 | except FileNotFoundError as e: 32 | vectorizer_analyzer = CountVectorizer() 33 | fit_and_transform=True 34 | if dataset == "test": 35 | raise Exception("Cannot transform without fitting") 36 | featurize(dataset, vectorizer_analyzer) 37 | 38 | 39 | if __name__ == '__main__': 40 | create_featurizer("train") 41 | create_featurizer("test") -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | atpublic==2.0 3 | cachetools==4.1.1 4 | certifi==2020.6.20 5 | cffi==1.14.3 6 | chardet==3.0.4 7 | click==7.1.2 8 | colorama==0.4.3 9 | commonmark==0.9.1 10 | configobj==5.0.6 11 | cryptography==3.1.1 12 | decorator==4.4.2 13 | dictdiffer==0.8.1 14 | distro==1.5.0 15 | dpath==2.0.1 16 | dvc==1.7.9 17 | Flask==1.1.2 18 | flatten-dict==0.3.0 19 | flufl.lock==3.2 20 | funcy==1.15 21 | future==0.18.2 22 | gitdb==4.0.5 23 | GitPython==3.1.8 24 | google-api-core==1.22.2 25 | google-api-python-client==1.12.2 26 | google-auth==1.21.3 27 | google-auth-httplib2==0.0.4 28 | google-cloud-core==1.4.2 29 | google-cloud-storage==1.31.2 30 | google-crc32c==1.0.0 31 | google-resumable-media==1.1.0 32 | googleapis-common-protos==1.52.0 33 | grandalf==0.6 34 | gunicorn==20.0.4 35 | httplib2==0.18.1 36 | idna==2.10 37 | itsdangerous==1.1.0 38 | Jinja2==2.11.2 39 | joblib==0.16.0 40 | jsonpath-ng==1.5.2 41 | MarkupSafe==1.1.1 42 | nanotime==0.5.2 43 | networkx==2.4 44 | nltk==3.5 45 | numpy==1.19.2 46 | oauth2client==4.1.3 47 | packaging==20.4 48 | pandas==1.1.2 49 | pathlib2==2.3.5 50 | pathspec==0.8.0 51 | ply==3.11 52 | protobuf==3.13.0 53 | pyasn1==0.4.8 54 | pyasn1-modules==0.2.8 55 | pycparser==2.20 56 | pydot==1.4.1 57 | PyDrive2==1.6.2 58 | Pygments==2.7.1 59 | pygtrie==2.3.2 60 | pyOpenSSL==19.1.0 61 | pyparsing==2.4.7 62 | python-dateutil==2.8.1 63 | pytz==2020.1 64 | PyYAML==5.3.1 65 | regex==2020.7.14 66 | requests==2.24.0 67 | rich==7.1.0 68 | rsa==4.6 69 | ruamel.yaml==0.16.12 70 | ruamel.yaml.clib==0.2.2 71 | scikit-learn==0.23.2 72 | scipy==1.5.2 73 | shortuuid==1.0.1 74 | shtab==1.3.1 75 | six==1.15.0 76 | sklearn==0.0 77 | smmap==3.0.4 78 | tabulate==0.8.7 79 | threadpoolctl==2.1.0 80 | toml==0.10.1 81 | tqdm==4.49.0 82 | typing-extensions==3.7.4.3 83 | uritemplate==3.0.1 84 | urllib3==1.25.10 85 | voluptuous==0.12.0 86 | Werkzeug==1.0.1 87 | zc.lockfile==2.0 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLOps - Devfest 2 | 3 | The project is designed to for a talk at #devfestindia-2020 4 | 5 | ## Video 6 | 7 | [DevfestIndia Talk Video](https://youtu.be/chtdrBvJpZ4?t=1101) 8 | 9 | ## Topics to be convered 10 | 11 | 1. Data versioning 12 | 2. Building training pipelines 13 | 3. Versioning models 14 | 4. Deploying using docker and kubernetes 15 | 16 | --- 17 | 18 | ## Recreate Steps for new project 19 | 20 | [Checkout demo steps here](demo_steps.md) 21 | 22 | 23 | 24 | --- 25 | 26 | ## Run this project 27 | 28 | ### Pre-Requisites 29 | 30 | 1. Python 3.7 31 | 2. Pip 32 | 33 | ### Setup the project 34 | 35 | 1. Clone the repo 36 | 37 | ``` 38 | git clone 39 | cd 40 | ``` 41 | 42 | 2. Pull the data from gdrive 43 | 44 | ``` 45 | dvc pull -r gdrive 46 | ``` 47 | 48 | > It will ask for authorization, please enter the auth key from the URL 49 | 50 | 3. Install requirements 51 | 52 | ``` 53 | pip install -r requirements.txt 54 | ``` 55 | 56 | ### Run project 57 | 58 | 59 | 1. Run the application 60 | 61 | ``` 62 | python app/app.py 63 | ``` 64 | 65 | 2. Update model/any ML step and run 66 | 67 | ``` 68 | dvc repro 69 | dvc push -r gdrive 70 | ``` 71 | 72 | 73 | --- 74 | 75 | ## DVC Notes 76 | 77 | > How is the pipeline created? 78 | 79 | ``` 80 | dvc run -n preprocess -d src/preprocess.py -d assets/original_data/train.csv -o assets/preprocessed/ python src/preprocess.python 81 | dvc run -n featurize -d src/preprocess.py -d assets/preprocessed/train.csv -d assets/preprocessed/train.csv -o assets/featurized/ python src/featurize.py 82 | dvc run -fn train_test_eval -d src/model.py -d assets/featurized -p model.random,model.split -o assets/models -M assets/eval/scores.json python src/model.py 83 | dvc run -fn train_test_eval -d src/model.py -d assets/featurized -p model.random,model.split -o assets/models -M assets/eval/scores.json python src/model.py 84 | ``` 85 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | from sklearn.naive_bayes import MultinomialNB 2 | import pickle 3 | import featurize 4 | import pandas as pd 5 | import json 6 | from config import Config 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import confusion_matrix 9 | from sklearn.metrics import accuracy_score 10 | import os 11 | import yaml 12 | 13 | params = yaml.safe_load(open('params.yaml'))['model'] 14 | 15 | def train(model, X_train, y_train): 16 | fitted_model = model.fit(X_train,y_train) 17 | 18 | 19 | def test(model, X_test, y_test): 20 | prediction = model.predict(X_test) 21 | cm = confusion_matrix(y_test, prediction) 22 | score = accuracy_score(y_test, prediction) 23 | with open(f"{Config.EVAL_PATH}/scores.json", "w") as f: 24 | f.write(json.dumps({"acc": score})) 25 | print ("Accuracy ::", score) 26 | print ("CM :: \n", cm) 27 | 28 | def infer_df(model, df): 29 | 30 | result_df = pd.DataFrame() 31 | result_df["infer"] = model.predict(df) 32 | result_df["probability"] = model.predict_proba(df)[0][1] 33 | return result_df 34 | 35 | def infer(model): 36 | vectorizer_analyzer = pickle.load(open(f"{Config.FEATURES_PATH}/vectorizer.pickle", "rb")) 37 | X_infer = featurize.featurize("test", vectorizer_analyzer) 38 | result_df = infer_df(model, X_infer) 39 | 40 | inference_df = pd.read_csv(f"{Config.ORIGINAL_DATASET_PATH}/test.csv") 41 | result_df["id"] = inference_df["id"] 42 | result_df.to_csv(f"{Config.MODELS_PATH}/result.csv", index=False) 43 | 44 | 45 | 46 | def run(model): 47 | train_df = pd.read_csv(Config.TRAIN_DATASET) 48 | 49 | 50 | vectorizer_analyzer = pickle.load(open(f"{Config.FEATURES_PATH}/vectorizer.pickle", "rb")) 51 | X_features = featurize.featurize("train", vectorizer_analyzer) 52 | X_train, X_test, y_train, y_test = train_test_split(X_features, train_df["label"], test_size = params["split"], random_state = params["random"]) 53 | # print ("Train, test :: ", X_train.shape, X_test.shape) 54 | del X_features 55 | 56 | train(model, X_train, y_train) 57 | test(model, X_test, y_test) 58 | result = infer(model) 59 | 60 | import os 61 | if not os.path.isdir(Config.MODELS_PATH): 62 | os.mkdir(Config.MODELS_PATH) 63 | 64 | if not os.path.isdir(Config.EVAL_PATH): 65 | os.mkdir(Config.EVAL_PATH) 66 | 67 | if __name__ == '__main__': 68 | # from sklearn.linear_model import SGDClassifier 69 | # model = SGDClassifier() 70 | model = MultinomialNB() 71 | run(model) 72 | pickle.dump(model, open(f"{Config.MODELS_PATH}/model.pickle", "wb")) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------