├── .github
    └── workflows
    │   ├── benchmark_and_test_model.yml
    │   ├── deploy.yml
    │   └── run_dags.yml
├── .gitignore
├── Makefile
├── README.md
├── config
    └── config.toml
├── dags
    ├── etl_twitter_dag.py
    ├── model_training_dag.py
    └── task_definitions
    │   ├── etl_task_definitions.py
    │   └── model_training.py
├── dependencies
    ├── Dockerfile
    └── requirements.txt
├── docker-compose.yaml
├── images
    ├── Sagemaker_endpoint.jpg
    ├── architecture_diagram.jpeg
    ├── ecr_image.PNG
    ├── etl_dag.PNG
    ├── mlflow_exps.PNG
    ├── model_dag.PNG
    ├── model_plot.png
    ├── model_registry_latest1.PNG
    ├── model_registry_latest2.PNG
    └── model_registry_org.PNG
├── scripts
    ├── behavioral_test.py
    ├── deploy.py
    ├── stage_model_to_production.py
    └── test_data
    │   ├── sample_test_data_for_mft.parquet
    │   └── test_data.parquet
├── test_results
    ├── Invariance_latest_test_results.csv
    ├── Invariance_production_test_results.csv
    ├── MFT_latest_test_results.csv
    └── MFT_production_test_results.csv
└── utils
    ├── experiment_tracking.py
    ├── helper.py
    ├── model.py
    └── prepare_data.py


/.github/workflows/benchmark_and_test_model.yml:
--------------------------------------------------------------------------------
 1 | # Name of the workflow
 2 | name: Test and benchmark models 
 3 | 
 4 | on:
 5 |   push:
 6 |     branches: [main]
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: self-hosted
11 |     steps:
12 |     - uses: actions/checkout@v2
13 |     
14 |     - name: Test and benchmark models
15 |       id: test_benchmark
16 |       env:
17 |         AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
18 |         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
19 |         REGION: ${{ secrets.REGION }}
20 | 
21 |       run: |
22 |         python3 -m pip install --upgrade pip
23 |         pip install -r ./dependencies/requirements.txt
24 |         python -m spacy download en_core_web_sm
25 |         python ./scripts/stage_model_to_production.py
26 | 
27 | 


--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
 1 | # Name of the workflow
 2 | name: Deploy to sagemaker
 3 | 
 4 | on: workflow_dispatch
 5 | 
 6 | jobs:
 7 |   build:
 8 |     runs-on: self-hosted
 9 |     steps:
10 |     - uses: actions/checkout@v2
11 |     
12 |     - name: Deploy production-ready image from AWS ECR to Sagemaker
13 |       id: deploy_to_prod
14 |       env:
15 |         AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
16 |         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
17 |         REGION: ${{ secrets.REGION }}
18 |         IMAGE_URI: ${{ secrets.IMAGE_URI }}
19 |         ARN_ROLE: ${{ secrets.ARN_ROLE }}
20 | 
21 |       run: |
22 |         python3 -m pip install --upgrade pip
23 |         pip install -r ./dependencies/requirements.txt
24 |         python ./scripts/deploy.py


--------------------------------------------------------------------------------
/.github/workflows/run_dags.yml:
--------------------------------------------------------------------------------
 1 | # Name of the workflow
 2 | name: Run Airflow DAG 
 3 | 
 4 | on: workflow_dispatch
 5 | 
 6 | jobs:
 7 |   build:
 8 |     runs-on: self-hosted
 9 |     steps:
10 |     - uses: actions/checkout@v2
11 |       with:
12 |         # Loading the secrets
13 |         secrets: |
14 |           "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}"
15 |           "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}"
16 |           "REGION=${{ secrets.REGION }}"
17 |           "LOGIN=${{ secrets.LOGIN }}"
18 |           "PASSWORD=${{ secrets.PASSWORD }}"
19 |           "HOST=${{ secrets.HOST }}"
20 |           "ACCOUNT=${{ secrets.ACCOUNT }}"
21 |           "WAREHOUSE=${{ secrets.WAREHOUSE }}"
22 |           "DATABASE=${{ secrets.DATABASE }}"
23 |           "SCHEMA=${{ secrets.SCHEMA }}"
24 | 
25 |     - name: Run airflow dag
26 |       run: make run_dag


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile to run Airflow DAG in docker container with external dependencies
 2 | 
 3 | include .env
 4 | 
 5 | run_dag:
 6 | #	Build extended airflow docker image with required pip dependencies
 7 | 	docker build . -f ./dependencies/Dockerfile --tag extending_airflow:latest
 8 | #	Rebuild airflow webserver and scheduler with our newly build image
 9 | 	docker-compose up -d --no-deps --build airflow-webserver airflow-scheduler
10 | 
11 | #	Start all required containers to run all airflow services 
12 | 	docker-compose -f docker-compose.yaml up -d
13 | 	docker ps
14 | 	sleep 15
15 | 
16 | #	Triggering DAG for the first time by accessing the webserver container
17 | 	docker exec -it  twitter_bot_airflow-webserver_1 bash -c "airflow dags trigger twitter_data_pipeline_dag_etl
18 | 
19 | stop_dag:
20 | 	docker-compose -f docker-compose.yaml down


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Sentiment analysis from MLOps paradigm
  2 | 
  3 | ![benchmark](https://github.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/workflows/Test%20and%20benchmark%20models/badge.svg)
  4 | ![deploy](https://github.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/workflows/Deploy%20to%20sagemaker/badge.svg)
  5 | 
  6 | This project promulgates an **automated end-to-end ML pipeline** that trains a **bi-directional LSTM** network for sentiment analysis task, **tracks** experiments, **pushes** trained models to **model registry**, benchmark them by means of **model testing** and **evaluation**, pushes the best model into production followed by **dockerizing** the production model artifacts into a deployable image and **deploys** the same into cloud instance via **CI/CD**. 
  7 | 
  8 | ## Author
  9 | 
 10 | - [@Jithin Sasikumar](https://www.github.com/Jithsaavvy)
 11 | 
 12 | ## Languages and Tools
 13 | 
 14 | <div align="">
 15 | <a href="https://www.python.org" target="_blank" rel="noreferrer"><img src="https://raw.githubusercontent.com/devicons/devicon/master/icons/python/python-original.svg" alt="python" width="40" height="40"/></a>
 16 | <a href="https://www.tensorflow.org" target="_blank" rel="noreferrer"><img src="https://www.vectorlogo.zone/logos/tensorflow/tensorflow-icon.svg" alt="tensorflow" width="40" height="40"/></a>
 17 | <a href="https://www.docker.com/" target="_blank" rel="noreferrer"><img src="https://raw.githubusercontent.com/devicons/devicon/master/icons/docker/docker-original-wordmark.svg" alt="docker" width="40" height="40"/></a>
 18 | <a href="https://airflow.apache.org/" target="_blank" rel="noreferrer"> <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/d/de/AirflowLogo.png/1200px-AirflowLogo.png" alt="airflow" width="95" height="43"/></a>
 19 | <a href="https://github.com/features/actions" target="_blank" rel="noreferrer"> <img src="https://res.cloudinary.com/practicaldev/image/fetch/s--2mFgk66y--/c_limit,f_auto,fl_progressive,q_80,w_375/https://dev-to-uploads.s3.amazonaws.com/uploads/badge/badge_image/78/github-actions-runner-up-badge.png" alt="actions" width="52" height="49"/></a>
 20 | <a href="https://www.mlflow.org/docs/latest/python_api/mlflow.html" target="_blank" rel="noreferrer"> <img src="https://www.mlflow.org/docs/latest/_static/MLflow-logo-final-black.png" alt="mlflow" width="98" height="44"/></a>
 21 | <a href="https://aws.amazon.com/s3/" target="_blank" rel="noreferrer"> <img src="https://www.liblogo.com/img-logo/aw314s096-aws-s3-logo-setting-up-aws-s3-for-open-edx-blog.png" alt="s3" width="73" height="44"/></a> 
 22 | <a href="https://aws.amazon.com/ec2/" target="_blank" rel="noreferrer"> <img src="https://www.filepicker.io/api/file/sBtCHoB8TwiRNve8nRRp" alt="ec2" width="81" height="44"/></a> 
 23 | <a href="https://aws.amazon.com/ecr/" target="_blank" rel="noreferrer"> <img src="https://sysdig.es/wp-content/uploads/logo-amazon-ecr-885x240-1.png" alt="ecr" width="145" height="41"/></a>
 24 | <a href="https://aws.amazon.com/sagemaker/" target="_blank" rel="noreferrer"> <img src="https://miro.medium.com/max/800/1*Epd215c8urNyJ_DJqkKHJg.png" alt="sagemaker" width="115" height="50"/></a> 
 25 | <a href="https://aws.amazon.com/sagemaker/" target="_blank" rel="noreferrer"> <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Snowflake_Logo.svg/2560px-Snowflake_Logo.svg.png" alt="sagemaker" width="110" height="37"/></a> 
 26 | </div>
 27 | <br>
 28 | 
 29 | ## Motivation
 30 | 
 31 | In a machine learning (ML) project, it comprises of a chain of tasks like data collection, pre-processing, transforming datasets, feature extraction, model training, model selection, evaluation, deployment. For a small-scale project, these tasks can be managed manually but, as the scalability and scope of the project increases, manual process is really a pain. The actual problem arises when the model has to be productionalized in order to make value out of it. MLOps defines various disciplines to nullify such problems and work efficiently. Thus, pipelines are crucial in an ML project and automating such end-to-end pipelines are also vital.
 32 | 
 33 | ## Description
 34 | 
 35 | The project is a concoction of research (sentiment analysis, NLP, BERT, biLSTM), development (text normalization, ETL, transformation, deep neural network training, evaluation, model testing) and deployment (building and packaging model artifacts, tracking, docker, workflows, pipelines, cloud) by integrating CI/CD pipelines with automated releases.
 36 | 
 37 | | ![flowchart](./images/architecture_diagram.jpeg) |
 38 | |:--:|
 39 | | <b>Figure 1: Complete end-to-end project pipeline</b>|
 40 | 
 41 | ## Technical facets
 42 | 
 43 | 1. Setting up `Airflow` in docker for `workflow orchestration`.
 44 | 2. Writing a `Dockerfile` that creates a base docker image with all dependencies installed and secrets containing sensitive credentials and access tokens are mounted.
 45 | 2. Defining `ETL` and `model training` workflow followed by scheduling them for orchestration.
 46 | 3. Executing `Airflow DAGs`:
 47 |     - **ETL** - Performs Extract, Transform and Load operation on twitter data. As a result, raw tweets scraped from twitter are processed and loaded into `Snowflake data warehouse` as database table.
 48 |     - **Model_training** - Deep end-to-end `biLSTM` model is trained using `Tensorflow` by fetching processed data from data warehouse.
 49 | 4. Tracking the entire model training using `MLflow server` hosted on `AWS EC2 instance` from which trained model artifacts, metrics and parameters are logged.
 50 | 5. Using `AWS S3 buckets` to store the model artifacts and data.
 51 | 6. Adding the trained model to `MLflow model registry` on `AWS EC2 instance` that facilitates in managing, maintaining, versioning, staging, testing and productionalizing the model collaboratively.
 52 | 7. Automating the `pipeline` as follows:
 53 |     - Initialize `GitHub Actions` workflows.
 54 |     - `benchmark_and_test_model.yml` => In order to productionalize a model, simply evaluating the model is not sufficient. So, it is very important to test them. Thus, the best model is pushed into **production stage** by means of **benchmarking** (`behavioral testing` + evaluation).
 55 |     - `deploy.yml` => The production model from model registry in `EC2 instance` is packaged into a docker image with all required dependencies & metadata as a `deployable model artifact` and pushed into `Amazon ECR` **(CI job)**. The deployable image is then deployed into `AWS Sagemaker` instance which creates an **endpoint** that can be used to communicate with the model for inference **(CD job)**.
 56 |     - `run_dags.yml` - Triggers Airflow DAG run that performs ETL and model training task based on schedule.
 57 |     -  `release.yml` => A new release will be created automatically when tags are pushed to the repository.
 58 | 
 59 | 
 60 | ## Directory structure
 61 | 
 62 | ```
 63 | ├── .github
 64 | │    └── workflows
 65 | │        ├── benchmark_and_test_model.yaml
 66 | |        ├── deploy.yaml
 67 | |        ├── release.yaml
 68 | |        └── run_dags.yaml
 69 | ├── config
 70 | │   └── config.toml
 71 | ├── dags                                # Directory where every Airflow DAG is defined
 72 | │   ├── etl_twitter_dag.py
 73 | │   ├── model_training_dag.py
 74 | │   └── task_definitions
 75 | │       ├── etl_task_definitions.py
 76 | │       └── model_training.py
 77 | ├── dependencies                       
 78 | │   ├── Dockerfile
 79 | │   └── requirements.txt
 80 | ├── docker-compose.yaml                 # Airflow and it's components run as docker containers
 81 | ├── images
 82 | ├── Makefile                            # Set of docker commands for Airflow run
 83 | ├── README.md
 84 | ├── scripts                             # Contains code for model testing, evaluation and deployment to AWS Sagemaker
 85 | │   ├── behavioral_test.py
 86 | │   ├── deploy.py
 87 | │   ├── stage_model_to_production.py
 88 | │   └── test_data
 89 | │       ├── sample_test_data_for_mft.parquet
 90 | │       └── test_data.parquet
 91 | ├── test_results
 92 | └── utils
 93 |     ├── experiment_tracking.py
 94 |     ├── helper.py
 95 |     ├── model.py
 96 |     └── prepare_data.py
 97 | 
 98 | ```
 99 | 
100 | ## Pipeline
101 | ### Dependencies & Secrets management
102 | 
103 | As mentioned above, Airflow is running in a docker container. In order to install dependencies, a docker image is build with all installed dependencies and it will be used as a base image for `docker-compose`. The dependencies are listed in [requirements.txt](./dependencies/requirements.txt). A more better way would be, to use any kind of dependency management tools like **Poetry** for organizational projects but, it is out of scope  for this project.
104 | 
105 | One important challenge would be, to manage &  handle sensitive information such as **credentials, access tokens** etc needed for Airflow to connect with other services like `AWS S3`, `Snowflake`, `EC2`. It is vulnerable to use any such sensitive info during `docker build` as they will be exposed as a result of layer caching during image build. The secure way is to mount them into the image as **docker secrets** and then export it as environment variables, so that they aren't leaked. It can be done as follows:
106 | 
107 | -  Create secrets using the command
108 | ```
109 | docker secret create
110 | ```
111 | 
112 | - Mount those secrets into `/run/secrets/` of the container
113 | ```
114 | RUN --mount=type=secret,id=<YOUR_SECRET> \
115 |     export <YOUR_SECRET_ID>=$(cat /run/secrets/<YOUR_SECRET>)
116 | ```
117 | 
118 | #### ProTip to do the same in production environment
119 | 
120 | The aforementioned steps are not well suitable for production. To do so, use `docker stack`. For more info, refer [here](https://docs.docker.com/engine/swarm/stack-deploy/)
121 | 
122 | ### Workflow Orchestration - Airflow
123 | 
124 | [Apache Airflow](https://airflow.apache.org/) is used to orchestrate workflows in this project. The workflows are represented as **Directed Acyclic Graph** `(DAG)`.
125 | 
126 | ### DAGS
127 | 
128 | ### ETL
129 | It is a data workflow that performs Extract Transform Load `(ETL)` task defined in [etl_twitter_dag.py](./dags/etl_twitter_dag.py) on scheduled interval. It performs the following tasks:
130 | - The raw tweets are scraped from twitter using [snscrape](https://pypi.org/project/snscrape/) library and loaded to `AWS S3 bucket`.
131 | - They are cleaned using **regular expressions** and labelled by calculating **polarity** and also loaded to the same `S3 bucket`.
132 | - The labelled data is normalized and preprocessed using NLP techniques and loaded as **database table** to `Snowflake data warehouse` which can be used for analysis and model training.
133 | - The data is stored in the `parquet` format for efficient storage and retrieval.
134 | 
135 | | ![flowchart](./images/etl_dag.PNG) |
136 | |:--:|
137 | | <b>Figure 2: ETL Data pipeline - Airflow</b>|
138 | 
139 | ### Model training
140 | It is a model training workflow that trains deep end-to-end `biLSTM` network with `BERT tokenizer`. Detailed explanation of biLSTM model can be found [here](#bi-directional-lstm-model). The DAG performs the following tasks:
141 | 
142 | - Preprocessed data loaded as a result of ETL pipeline is fetched from the database
143 | table of **snowflake data warehouse** as a **dataframe**.
144 | - External (user-build) **docker container** with `tensorflow GPU` and other dependencies installed, is used to train the model. It is facilitated in Airflow by `DockerOperator` as:
145 | ```
146 | DockerOperator(
147 |                 task_id = "train_model_task",
148 |                 image = "model_training_tf:latest",
149 |                 auto_remove = True,
150 |                 docker_url = "unix://var/run/docker.sock",
151 |                 api_version = "auto",
152 |                 command = "python3 model_training.py"
153 |             )
154 | ```
155 | - The **GPU accelerated** training for the above task is defined in [model_training.py](./dags/task_definitions/model_training.py). Additionally, `BERT tokenizer` is used instead of normal tokenizer **(i.e.)** The texts are tokenized and each tokens are encoded into unique IDs referred as `input_ids`. Finally, they are transformed as `tensorflow datasets` for efficient input pipeline and fed into the model. All these are defined in [prepare_data.py](./utils/prepare_data.py).
156 | 
157 | | ![flowchart](./images/model_dag.PNG) |
158 | |:--:|
159 | | <b>Figure 3: Model training pipeline - Airflow</b>|
160 | 
161 | **Note:** <br>
162 | *GPU used for training*: NVIDIA GeForce GTX 980M with `8GB GDDR5` memory
163 | 
164 | ### Bi-directional LSTM model
165 | biLSTM network encompassing an
166 | `embedding layer`, stack of `biLSTM layers` followed by `fully connected dense layers`
167 | with `dropout` is used for this project. The **model plot** is depicted in the below image:
168 | 
169 | <p align="center">
170 |   <img src="./images/model_plot.png" />
171 | </p>
172 | 
173 | ### MLflow Server
174 | 
175 | All the experiments are tracked and logged by [MLflow](https://mlflow.org/docs/latest/tracking.html). It is not done locally in a **localhost**, instead the `MLflow Server` is installed and hosted in `AWS EC2 instance` as a **remote tracking server** which paves way for centralized access. The **trained model artifacts** are saved in `AWS S3 bucket` which serves as an artifact store and parameters, metrics (per epoch), all other metadata are logged into EC2 instance itself.
176 | 
177 | | ![flowchart](./images/mlflow_exps.PNG) |
178 | |:--:|
179 | | <b>Figure 4: All experiment runs on MLflow Server - EC2 Instance</b>|
180 | 
181 | ### MLflow Model Registry
182 | 
183 | The models to be staged and tested are pushed to the model registry which serves as a **centralized model store**. It facilitates to manage, version, stage, test and productionalize the model and provides functionalities to work on the models collaboratively.
184 | 
185 | | ![flowchart](./images/model_registry_org.PNG) |
186 | |:--:|
187 | | <b>Figure 5: Model Registry with already existing production model and staged model - EC2 Instance </b>|
188 | 
189 | ### Benchmarking
190 | 
191 | The model with latest version and model in production stage are benchmarked by means of behavioral testing and evaluation. This is done to find out whether the latest model outperforms the current production model. If yes, it triggers the `CI/CD` workflow job.
192 | 
193 | Model testing differs from model evaluation. For instance, a model with high evaluation metric doesn't always guarantee to be the best performing model because, it might fail in some specific scenarios. To solve and quantify that, **model testing** is an important aspect in production.
194 | 
195 | ### Behavioral testing
196 | 
197 | It is based on this [paper](https://homes.cs.washington.edu/~marcotcr/acl20_checklist.pdf) to test the behavior of the model in specific conditions. [Checklist](https://github.com/marcotcr/checklist) library is used for performing both the tests. These testing functions are defined in [behavioral_test.py](./scripts/behavioral_test.py). Three different types of tests are proposed in the paper but only two of them are performed in this project namely:
198 | - Minimum Functionality test (MFT)
199 | - Invariance test (INV)
200 | 
201 | ### MFT:
202 | MFT is inspired from unit test. A specific behavior (or) capability of the model is tested.
203 | 
204 | | 1. |         **Model**         |                                                 Sentiment Analysis                                                 |
205 | |----|:-------------------------:|:------------------------------------------------------------------------------------------------------------------:|
206 | | 2. |        **Dataset**        | Perturbed dataset created from a small subset of test dataset with labels. Original texts are negated as perturbed |
207 | | 3. | **Minimum functionality** |                             Negations (i.e.) how well the model handles negated inputs                             |
208 | | 4. |        **Example**        |   *Original text*: This product is very good - **Positive** <br> *Negated text*: This product is not very good - **Negative** |
209 | | 5. |   **Expected behavior**   |              Model should be generalized to predict correct labels for both original and negated text              |
210 |  
211 | ### INV
212 | 
213 | Label-preserving perturbations are applied to the test data. Despite perturbing the data, the model is expected to give the same prediction.
214 | 
215 | |  1. |         **Model**         |                                                       Sentiment Analysis                                                      |
216 | |:---:|:-------------------------:|:-----------------------------------------------------------------------------------------------------------------------------:|
217 | |  2. |        **Dataset**        |                                Larger subset of test dataset is perturbed by adding invariances and their contexts are preserved                               |
218 | | 3.  | **Invariance** |                      Typos and expanding contractions (i.e.) how well the model handle these invariances                      |
219 | |  4. |        **Example**        | *Original text*: I haven't liked this product - **Negative** <br> *Invariance text*: I have not liekd this prodcut - **Negative** |
220 | |  5. |   **Expected behavior**   |  Model should be generalized to handles these invariances and predict same label for both original and invariance texts  |
221 | 
222 | Benchmarking (defined in [stage_model_to_production.py](./scripts/stage_model_to_production.py)) is done as follows:
223 | - Latest and current production models are pulled from the model registry.
224 | - Test data (fresh data that the model hasn't seen during training) is fetched from S3 bucket.
225 | - **Behavioral testing** (perturbed data) and **evaluation** (original test data) is performed for both the models and metrics are returned.
226 | - If the latest model outperform the current production model, then push latest model into production and archive current production model.
227 | 
228 | ```
229 | productionalize_ = Productionalize(tracking_uri = config["model-tracking"]["mlflow_tracking_uri"],
230 |                                     test_data = config["files"]["test_data"],
231 |                                     model_name = config["model-registry"]["model_name"],
232 |                                     batch_size = config["train-parameters"]["batch_size"],
233 |                                     sequence_length = config["train-parameters"]["sequence_length"]
234 |                                     )
235 | 
236 | accuracy_latest_model, accuracy_production_model = productionalize_.benchmark_models()
237 | 
238 | success_ = productionalize_.push_new_model_to_production(accuracy_latest_model, accuracy_production_model)
239 | ```
240 | 
241 | | ![flowchart](./images/model_registry_latest1.PNG) |
242 | |:--:|
243 | | <b>Figure 6: Model Registry with latest model pushed to production model and archiving the other one - EC2 Instance </b>|
244 | 
245 | | ![flowchart](./images//model_registry_latest2.PNG) |
246 | |:--:|
247 | | <b>Figure 7: Model Registry with latest production model - EC2 Instance </b>|
248 | 
249 | ### CI/CD
250 | 
251 | It involves packaging the model artifacts into an image and deploy them to cloud instance. The steps are as follows:
252 | - The model registry in **EC2 instance** holds the **latest production model** that have passed both testing and evaluation.
253 | - The production model from the model registry is packaged and build into a docker image with all required dependencies & metadata as a **deployable model artifact**.
254 | - This artifact is then pushed into **Amazon ECR** that serves as a container registry. 
255 | 
256 | | ![flowchart](./images/ecr_image.PNG) |
257 | |:--:|
258 | | <b>Figure 8: Deployable docker image pushed to AWS ECR </b>|
259 | 
260 | - Finally, the deployable image from ECR is deployed into `AWS Sagemaker` instance which creates an **endpoint** that can be used to communicate with the model for inferencing.
261 | - The endpoint can be tested using some tools like `Postman`.
262 | - The aforementioned steps are defined in [deploy.py](./scripts/deploy.py). All the necessary secrets are exported as environment variables. Specific IAM role and user have been created for deployment.
263 | 
264 | ```
265 | sagemaker._deploy(
266 |                 mode = 'create',
267 |                 app_name = app_name,
268 |                 model_uri = model_uri,
269 |                 image_url = docker_image_url,
270 |                 execution_role_arn = role,
271 |                 instance_type = 'ml.m5.xlarge',
272 |                 instance_count = 1,
273 |                 region_name = region
274 |                 )
275 | ```
276 | 
277 | | ![flowchart](./images/Sagemaker_endpoint.jpg) |
278 | |:--:|
279 | | <b>Figure 9: Production model deployed to AWS Sagemaker </b>|
280 | 
281 | **Note:** <br>
282 | *Every AWS resources created for this project will be deleted after the pipeline is executed successfully. This is done on purpose, to restrict and limit any incurring additional cost!!*
283 | 
284 | ## Feedback
285 | 
286 | If you have any feedback, please reach out to me at jithsasikumar@gmail.com
287 | 
288 | ## Bug / Issues
289 | 
290 | If you come across any bugs (or) issues related to code, model, implementation, results, pipeline etc, please feel free to open a [new issue here](https://github.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/issues/new) by describing your search query and expected result.
291 | 
292 | ## References
293 | 
294 | [Paper - Beyond Accuracy: Behavioral Testing of NLP models with CheckList](https://homes.cs.washington.edu/~marcotcr/acl20_checklist.pdf)
295 | 
296 | [https://github.com/marcotcr/checklist](https://github.com/marcotcr/checklist)
297 | 
298 | [AWS Documentations](https://docs.aws.amazon.com/)
299 | 
300 | [Airflow Docs](https://airflow.apache.org/docs/)


--------------------------------------------------------------------------------
/config/config.toml:
--------------------------------------------------------------------------------
 1 | [tweets-scraping]
 2 | search_query = "mlops"
 3 | tweet_limit = 50000
 4 | 
 5 | [aws]
 6 | connection_id = "s3_connection"
 7 | s3_bucket_name = "twitter-data-bucket"
 8 | temp_data_path = "/opt/airflow/dags/"
 9 | 
10 | [files]
11 | raw_file_name = "raw_tweets.parquet"
12 | labelled_file_name = "labelled_tweets.parquet"
13 | preprocessed_file_name = "preprocessed_tweets.parquet"
14 | test_data = "./scripts/test_data/test_data.parquet"
15 | 
16 | [train-parameters]
17 | batch_size = 128
18 | num_classes = 3
19 | embedding_dim = 128
20 | sequence_length = 512
21 | num_epochs = 4
22 | learning_rate = 2e-3
23 | 
24 | [model-tracking]
25 | experiment = false
26 | experiment_name = "sentiment_classifier"
27 | run_name = "sc_run3"
28 | mlflow_tracking_uri = "http://ec2-44-203-120-100.compute-1.amazonaws.com:5000/"
29 | 
30 | [model-registry]
31 | model_name = "sentiment_classifier"
32 | filter_string = "name LIKE 'sentiment%'"
33 | 
34 | [model-deploy]
35 | endpoint_name = "sentiment-classifier"
36 | 
37 | [misc]
38 | query = "SELECT * from PROCESSED_TWEET"
39 | table_name = "PROCESSED_TWEETS"


--------------------------------------------------------------------------------
/dags/etl_twitter_dag.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """
  4 | @author: Jithin Sasikumar
  5 | 
  6 | Script to define the data pipeline as Airflow DAG that performs ETL (Extract Load Transform) tasks such as
  7 | scraping tweets from twitter, labelling, cleaning, normalizing and preprocessing the raw data to be used
  8 | for analysis and model training on scheduled interval.
  9 | """
 10 | 
 11 | import os
 12 | import json
 13 | import sys
 14 | from datetime import datetime
 15 | from airflow.decorators import task, dag
 16 | from airflow.utils.task_group import TaskGroup
 17 | from airflow.operators.python import PythonOperator
 18 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook
 19 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook
 20 | from snowflake.connector.pandas_tools import write_pandas
 21 | from airflow.models.connection import Connection
 22 | from task_definitions.etl_task_definitions import scrap_raw_tweets_from_web, preprocess_tweets
 23 | from task_definitions.etl_task_definitions import add_sentiment_labels_to_tweets
 24 | 
 25 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 26 | from utils.helper import Config, Connections
 27 | from utils.helper import load_dataframe
 28 | 
 29 | 
 30 | # Load all configurations from config.toml
 31 | config = Config()
 32 | 
 33 | @dag(dag_id = "etl", start_date = datetime(2023,1,1), schedule_interval = "@monthly", catchup = False)
 34 | def twitter_data_pipeline_dag_etl() -> None:
 35 |     """
 36 |     Data pipeline for performing ETL task that has to be used for training.
 37 | 
 38 |     Returns
 39 |     -------
 40 |             None
 41 |     """
 42 | 
 43 |     @task(task_id = "configure_connections")
 44 |     def set_connections() -> None:
 45 |         """
 46 |         Task 1 => Configure and establish respective connections for external services like
 47 |         AWS S3 buckets and Snowflake data warehouse. The credentials are stored as docker secrets
 48 |         in respective containers and accessed as environment variables for secure usage which
 49 |         restricts them from getting leaked in the docker image or repository.
 50 | 
 51 |         Note:
 52 |             AWS credentials are generated using specific IAM users and roles.
 53 | 
 54 |         Returns
 55 |         -------
 56 |             None
 57 |         """
 58 |         
 59 |         # AWS S3 connection
 60 |         aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
 61 |         aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
 62 |         aws_region_name = os.environ["REGION"]
 63 |         s3_credentials = json.dumps(
 64 |                                 dict(
 65 |                                     aws_access_key_id = aws_access_key_id,
 66 |                                     aws_secret_access_key = aws_secret_access_key,
 67 |                                     aws_region_name = aws_region_name,
 68 |                                     )
 69 |                                 )
 70 | 
 71 |         s3_connection = Connection(conn_id = "s3_connection",
 72 |                                    conn_type = "S3",
 73 |                                    extra = s3_credentials
 74 |                                   )
 75 |         s3_conn_response = Connections(s3_connection).create_connections()
 76 | 
 77 |         # Snowflake connection
 78 |         login = os.environ["LOGIN"]
 79 |         password = os.environ["PASSWORD"]
 80 |         host_name = os.environ["HOST"]
 81 | 
 82 |         snowflake_connection = Connection(conn_id = "snowflake_conn",
 83 |                                           conn_type = "Snowflake",
 84 |                                           host = host_name,
 85 |                                           login = login,
 86 |                                           password = password
 87 |                                         )
 88 | 
 89 |         snowflake_conn_response = Connections(snowflake_connection).create_connections()
 90 | 
 91 | 
 92 |         if not s3_conn_response and snowflake_conn_response:
 93 |             print("Connection not established!!")
 94 | 
 95 |     #Instantiating S3 hook for respective tasks
 96 |     s3_hook = S3Hook(aws_conn_id = config["aws"]["connection_id"])
 97 | 
 98 |     # Task 2 => Refer respective task definition for documentation
 99 |     scrap_raw_tweets_from_web_ = PythonOperator(
100 |                                     task_id = "scrap_raw_tweets_from_web",
101 |                                     python_callable = scrap_raw_tweets_from_web,
102 |                                     op_kwargs = {
103 |                                         's3_hook': s3_hook,
104 |                                         'bucket_name': config["aws"]["s3_bucket_name"],
105 |                                         'search_query': config["tweets-scraping"]["search_query"],
106 |                                         'tweet_limit': config["tweets-scraping"]["tweet_limit"],
107 |                                         'raw_file_name': config["files"]["raw_file_name"]
108 |                                         }
109 |                                     )
110 | 
111 |     @task(task_id = "download_from_s3")
112 |     def download_data_from_s3_bucket(temp_data_path: str, file_name: str) -> None:
113 |         """
114 |         Task 3 => Download data stored in S3 buckets for usage.
115 | 
116 |         Parameters
117 |         ----------
118 |         temp_data_path: str
119 |             Path to save downloaded file.
120 |         file_name: str
121 |             Name of the downloaded file.
122 | 
123 |         Returns
124 |         -------
125 |             None
126 |         """
127 | 
128 |         # Creating a S3 hook using the connection created via task 1.
129 |         downloaded_file = s3_hook.download_file(
130 |                                             key = file_name,
131 |                                             bucket_name = config["aws"]["s3_bucket_name"],
132 |                                             local_path = temp_data_path
133 |                                             )
134 |         os.rename(src = downloaded_file, destination = f"{temp_data_path}/{file_name}")
135 | 
136 |     with TaskGroup(group_id = "sentiment_labelling") as group1:
137 |         #Task 4 => Refer respective task definition for documentation
138 |         add_sentiment_labels_to_scrapped_tweets_ = PythonOperator(
139 |                                                 task_id = "add_sentiment_labels_to_scrapped_tweets",
140 |                                                 python_callable = add_sentiment_labels_to_tweets,
141 |                                                 op_kwargs = {
142 |                                                     's3_hook': s3_hook,
143 |                                                     'bucket_name': config["aws"]["s3_bucket_name"],
144 |                                                     'temp_data_path': config["aws"]["temp_data_path"],
145 |                                                     'raw_file_name': config["files"]["raw_file_name"],
146 |                                                     'labelled_file_name': config["files"]["labelled_file_name"],
147 |                                                 }
148 |                                             )
149 | 
150 |         # Prioritizing every downstream tasks pertaining to task group 1
151 |         download_data_from_s3_bucket(config["aws"]["temp_data_path"], config["files"]["raw_file_name"]) >> add_sentiment_labels_to_scrapped_tweets_
152 | 
153 | 
154 |     with TaskGroup(group_id = "preprocess_tweets_using_NLP") as group2:
155 |         #Task 5 => Refer respective task definition for documentation
156 |         preprocess_tweets_ = PythonOperator(
157 |                                 task_id = "preprocess_labelled_tweets_using_nlp_techniques",
158 |                                 python_callable = preprocess_tweets,
159 |                                 op_kwargs = {
160 |                                     's3_hook': s3_hook,
161 |                                     'bucket_name': config["aws"]["s3_bucket_name"],
162 |                                     'temp_data_path': config["aws"]["temp_data_path"],
163 |                                     'labelled_file_name': config["files"]["labelled_file_name"],
164 |                                     'preprocessed_file_name': config["files"]["preprocessed_file_name"]
165 |                                 }
166 |                             )
167 |         
168 |         # Prioritizing every downstream tasks pertaining to task group 2
169 |         download_data_from_s3_bucket(config["aws"]["temp_data_path"], config["files"]["labelled_file_name"]) >> preprocess_tweets_
170 | 
171 |     @task(task_id = "load_processed_data_to_datawarehouse")
172 |     def load_processed_data_to_snowflake(processed_file: str, table_name: str) -> None:
173 |         """
174 |         Task 6 => Load and write final processed data into snowflake data warehouse. It loads the processed parquet
175 |         file as dataframe and loads it as a database table into the data warehouse.
176 | 
177 |         Parameters
178 |         ----------
179 |         processed_file: str
180 |             Name of preprocessed parquet file.
181 |         table_name: str
182 |             Name of the database table in snowflake data warehouse.
183 | 
184 |         Returns
185 |         -------
186 |             None
187 |         """
188 |         try:
189 |             # Similar to S3 hook, snowflake hook is used accordingly
190 |             snowflake_conn = SnowflakeHook(
191 |                                         snowflake_conn_id = "snowflake_conn",
192 |                                         account = os.environ["ACCOUNT"],
193 |                                         warehouse = os.environ["WAREHOUSE"],
194 |                                         database = os.environ["DATABASE"],
195 |                                         schema = os.environ["SCHEMA"],
196 |                                         role = os.environ["ROLE"]
197 |                                         )
198 | 
199 |             dataframe = load_dataframe(processed_file)
200 | 
201 |             # Functionality to write any pandas dataframe into snowflake
202 |             write_pandas(
203 |                         conn = snowflake_conn,
204 |                         df = dataframe,
205 |                         table_name = table_name,
206 |                         quote_identifiers = False
207 |                         )
208 |         
209 |         except Exception as exc:
210 |             raise ConnectionError("Something went wrong with the snowflake connection. Please check them!!") from exc
211 | 
212 |         finally:
213 |             snowflake_conn.close()
214 | 
215 |     # Prioritizing every downstream tasks pertaining to the entire DAG
216 |     set_connections() >> scrap_raw_tweets_from_web_>> group1 >> group2 >> load_processed_data_to_snowflake(config["files"]["preprocessed_file_name"], config["misc"]["table_name"])
217 | 
218 | 
219 | etl_dag = twitter_data_pipeline_dag_etl()


--------------------------------------------------------------------------------
/dags/model_training_dag.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """
 4 | @author: Jithin Sasikumar
 5 | 
 6 | Script to define model training pipeline as Airflow DAG that trains Bi-LSTM model with the
 7 | processed data from data warehouse. In the DAG, in order to improve the training time and
 8 | efficiency, the model training is done within an external (user-build) docker container with
 9 | tensorflow-gpu base image and it is not included in airflow docker compose.
10 | It is a GPU accelerated training.
11 | 
12 | """
13 | 
14 | import os
15 | import sys
16 | import pandas as pd
17 | from datetime import datetime
18 | from airflow.decorators import task, dag
19 | from airflow.providers.docker.operators.docker import DockerOperator
20 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook
21 | 
22 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
23 | from utils.helper import Config
24 | 
25 | # Load all configurations from config.toml
26 | config = Config()
27 | 
28 | @dag(dag_id = "model_training", start_date = datetime(2023,1,1), schedule_interval = "@monthly", catchup = False)
29 | def model_training_pipeline_dag() -> None:
30 |     """
31 |     Pipeline to perform the GPU accelerated model training within the user-build docker image
32 | 
33 |     Returns
34 |     -------
35 |             None
36 |     """
37 | 
38 |     @task(task_id = "load_data_from_warehouse")
39 |     def pull_snowflake_data_as_df(query: str) -> pd.DataFrame:
40 |         """
41 |         Task 1 => Loaded data as a result of ETL pipeline is fetched from the database
42 |         table of snowflake data warehouse as a dataframe. This will be used for
43 |         model training.
44 | 
45 |         Parameters
46 |         ----------
47 |         query: str
48 |             Database query
49 | 
50 |         Returns
51 |         -------
52 |         dataframe: pd.DataFrame
53 |             Fetched data
54 |         """
55 |         try:
56 |             snowflake_conn = SnowflakeHook(
57 |                                         snowflake_conn_id = "snowflake_conn",
58 |                                         account = os.environ["ACCOUNT"],
59 |                                         warehouse = os.environ["WAREHOUSE"],
60 |                                         database = os.environ["DATABASE"],
61 |                                         schema = os.environ["SCHEMA"],
62 |                                         role = os.environ["ROLE"]
63 |                                     )
64 | 
65 |             cursor = snowflake_conn.cursor().execute(query)
66 |             dataframe = cursor.fetch_pandas_all()
67 | 
68 |             return dataframe
69 | 
70 |         except Exception as exc:
71 |             raise ConnectionError("Snowflake connection error. Please check and try again!!") from exc
72 | 
73 |         finally:
74 |             cursor.close()
75 |             snowflake_conn.close()
76 | 
77 |     
78 |     # Task 2 => Refer /task_definitions/model_training.py for documentation
79 |     train_model = DockerOperator(
80 |                                 task_id = "train_model_task",
81 |                                 image = "model_training_tf:latest",
82 |                                 auto_remove = True,
83 |                                 docker_url = "unix://var/run/docker.sock",
84 |                                 api_version = "auto",
85 |                                 command = "python3 model_training.py"
86 |                                 )
87 | 
88 |     pull_snowflake_data_as_df(config["misc"]["query"]) >> train_model
89 | 
90 | model_train_dag = model_training_pipeline_dag()


--------------------------------------------------------------------------------
/dags/task_definitions/etl_task_definitions.py:
--------------------------------------------------------------------------------
  1 | """
  2 | @author: Jithin Sasikumar
  3 | 
  4 | Module that defines every task required for ETL data pipeline (DAG) to run successfully.
  5 | """
  6 | import os
  7 | import sys
  8 | import pandas as pd
  9 | import snscrape.modules.twitter as sntwitter
 10 | import nltk
 11 | from nltk.tokenize import word_tokenize
 12 | from nltk.corpus import stopwords
 13 | from nltk.stem import WordNetLemmatizer
 14 | from nltk.stem.porter import PorterStemmer
 15 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook
 16 | 
 17 | sys.path.append(os.path.join(os.path.dirname(__file__), "..."))
 18 | from utils import helper
 19 | nltk.download('punkt')
 20 | nltk.download('stopwords')
 21 | stopwords_ = stopwords.words("english")
 22 | nltk.download('wordnet')
 23 | nltk.download('omw-1.4')
 24 | nltk.download('vader_lexicon')
 25 | 
 26 | def scrap_raw_tweets_from_web(**kwargs) -> None:
 27 |     """
 28 |     Scrap raw tweets from twitter using snscrape library and load it as parquet file to S3 bucket.
 29 | 
 30 |     Parameters
 31 |     ----------
 32 |     **kwargs: Arbitrary keyword arguments
 33 |         See below for expansion
 34 | 
 35 |     keyword arguments
 36 |     -----------------
 37 |     **s3_hook: S3Hook
 38 |         Instance of S3Hook to connect with specified S3 bucket.
 39 |     **bucket_name: str
 40 |         Name of S3 bucket to load resulting raw parquet file.
 41 |     **search_query: str
 42 |         Keyword or topic to scrap the tweets.
 43 |     **tweet_limit: int
 44 |         Limit of tweets to scrap from.
 45 |     **raw_file_name: str
 46 |         Name of raw parquet file to be loaded to S3.
 47 | 
 48 |     Returns
 49 |     -------
 50 |         None
 51 |     """
 52 |     tweets = list()
 53 |     try:
 54 |         for index, tweet in enumerate(sntwitter.TwitterSearchScraper(kwargs["search_query"]).get_items()):
 55 |             if index != kwargs["tweet_limit"]:
 56 |                 tweets.append([tweet.date, tweet.id, tweet.lang,
 57 |                                 tweet.user.username, tweet.content])
 58 | 
 59 |         raw_tweets_dataframe = pd.DataFrame(
 60 |                                             tweets,
 61 |                                             columns = [
 62 |                                                         'datetime', 'id',
 63 |                                                         'lang', 'username',
 64 |                                                         'raw_tweets'
 65 |                                                     ]
 66 |                                             )
 67 |         
 68 |         raw_tweets_dataframe.to_parquet(kwargs["raw_file_name"],
 69 |                                         index = False, engine = "pyarrow")
 70 |         kwargs["s3_hook"].load_file(
 71 |                                     filename = kwargs["raw_file_name"],
 72 |                                     key = kwargs["raw_file_name"],
 73 |                                     bucket_name = kwargs["bucket_name"]
 74 |                                 )
 75 | 
 76 |     except Exception as exc:
 77 |         raise Exception("Something went wrong with the tweet scraping task. Please check them!!") from exc
 78 | 
 79 | def add_sentiment_labels_to_tweets(**kwargs) -> None:
 80 |     """
 81 |     Calculate polarity of tweets and assign sentiment labels for the same fro S3 bucket as extracted raw tweets
 82 |     are unlabelled.
 83 | 
 84 |     Parameters
 85 |     ----------
 86 |     **kwargs: Arbitrary keyword arguments
 87 |         See below for expansion
 88 | 
 89 |     keyword arguments
 90 |     -----------------
 91 |     **s3_hook: S3Hook
 92 |         Instance of S3Hook to connect with specified S3 bucket.
 93 |     **bucket_name: str
 94 |         Name of S3 bucket to load resulting raw parquet file.
 95 |     **temp_data_path: str
 96 |         Path to save intermittent temp file as a buffer.
 97 |     **raw_file_name: str
 98 |         Name of raw parquet file from S3.
 99 |     **labelled_file_name: str
100 |         Name of file containing respective sentiment labels.
101 | 
102 |     Returns
103 |     -------
104 |         None
105 |     """
106 |     dataframe = pd.read_parquet(
107 |                                 path = f"{kwargs['temp_data_path']}/{kwargs['raw_file_name']}",
108 |                                 engine = "pyarrow"
109 |                             )
110 |     dataframe_en = dataframe[dataframe['lang'] == "en"]
111 |     dataframe_en["cleaned_tweets"] = dataframe_en["raw_tweets"].apply(
112 |                                                                 lambda text: helper.remove_noise(text)
113 |                                                                 )
114 |     dataframe_en["polarity"] = dataframe_en["cleaned_tweets"].apply(
115 |                                                                 lambda text: helper.calculate_polarity(text)
116 |                                                                 )
117 |     dataframe_en["sentiment"] = dataframe_en["polarity"].apply(
118 |                                                                 lambda score: helper.assign_sentiment_labels(score)
119 |                                                             )
120 | 
121 |     dataframe_en.to_parquet(kwargs["labelled_file_name"],
122 |                             index = True, engine = "pyarrow")
123 |     kwargs["s3_hook"].load_file(
124 |                                 filename = kwargs["labelled_file_name"],
125 |                                 key = kwargs["labelled_file_name"],
126 |                                 bucket_name = kwargs["bucket_name"]
127 |                             )
128 | 
129 | def preprocess_tweets(**kwargs) -> None:
130 |     """
131 |     Normalize and preprocess labelled tweets from S3 using NLP techniques which wil be used for
132 |     model training.
133 | 
134 |     Parameters
135 |     ----------
136 |     **kwargs: Arbitrary keyword arguments
137 |         See below for expansion
138 | 
139 |     keyword arguments
140 |     -----------------
141 |     **s3_hook: S3Hook
142 |         Instance of S3Hook to connect with specified S3 bucket.
143 |     **bucket_name: str
144 |         Name of S3 bucket to load resulting raw parquet file.
145 |     **temp_data_path: str
146 |         Path to save intermittent temp file as a buffer.
147 |     **labelled_file_name: str
148 |         Name of file containing respective sentiment labels.
149 |     *preprocessed_file_name: str
150 |         Name of the file to be loaded to s3 after preprocessing.
151 | 
152 |     Returns
153 |     -------
154 |         None
155 |     """
156 |     dataframe = pd.read_parquet(path = f"{kwargs['temp_data_path']}/{kwargs['labelled_file_name']}",
157 |                                 engine = "pyarrow")
158 |     dataframe = dataframe.iloc[: , 1:]
159 |     dataframe['cleaned_tweets'] = dataframe['cleaned_tweets'].astype(str).str.lower()
160 |     dataframe['tokenized_tweets'] = dataframe["cleaned_tweets"].apply(word_tokenize)
161 | 
162 |     #Remove stopwords
163 |     dataframe['tokenized_tweets'] = dataframe['tokenized_tweets'].apply(
164 |                                                                 lambda tokens: helper.remove_stopwords(tokens, stopwords_)
165 |                                                                 )
166 |     dataframe = helper.remove_less_frequent_words(dataframe)
167 | 
168 |     #Lemmatize each tweet
169 |     wordnet_lem = WordNetLemmatizer()
170 |     dataframe['lemmatized_tweets'] = dataframe['tokenized_strings'].apply(lambda tweet: " ".join([
171 |                                                                             wordnet_lem.lemmatize(word)
172 |                                                                             for word in tweet.split()]))
173 | 
174 |     #Stem each tweet
175 |     porter_stemmer = PorterStemmer()
176 |     dataframe['processed_tweets'] = dataframe['lemmatized_tweets'].apply(lambda tweet: " ".join([
177 |                                                                             porter_stemmer.stem(word)
178 |                                                                             for word in tweet.split()]))
179 | 
180 |     dataframe = dataframe.reindex(columns = [col for col in dataframe.columns if col != 'sentiment'] + ['sentiment'])
181 | 
182 |     # Encoding labels (integers) to sentiments
183 |     dataframe['labels'] = dataframe['sentiment'].map(
184 |                                                     {
185 |                                                         "neutral": 0,
186 |                                                         "negative": 1,
187 |                                                         "positive": 2
188 |                                                     }
189 |                                                 )
190 |     # Printing in console to ensure that the entire process is successful which can be later accessed from Airflow logs
191 |     print(dataframe.shape, dataframe.columns)
192 | 
193 |     dataframe.to_parquet(kwargs["preprocessed_file_name"],
194 |                         index = False, engine = "pyarrow")
195 |     kwargs["s3_hook"].load_file(
196 |                             filename = kwargs["preprocessed_file_name"],
197 |                             key = kwargs["preprocessed_file_name"],
198 |                             bucket_name = kwargs["bucket_name"]
199 |                         )


--------------------------------------------------------------------------------
/dags/task_definitions/model_training.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """
  4 | @author: Jithin Sasikumar
  5 | 
  6 | Script to perform Bi-directional LSTM training with BERT tokenizer. This script will be copied and
  7 | executed inside external (user-build) docker container with tensorflow GPU installed. This is
  8 | provided in this directory for reference.
  9 | 
 10 | Every training run will be tracked, artifacts are logged by MLflow tracking server hosted on AWS EC2 instance.
 11 | (i.e.) training is performed locally using GPU via user-build docker container and entire model tracking &
 12 | logging happens in the EC2 instance by the tracking server.
 13 | 
 14 | """
 15 | 
 16 | import os
 17 | import sys
 18 | import pandas as pd
 19 | from tqdm.auto import tqdm
 20 | from dataclasses import dataclass
 21 | import tensorflow as tf
 22 | from sklearn.model_selection import train_test_split
 23 | from keras.models import Sequential
 24 | from keras.utils import to_categorical
 25 | from keras import losses, optimizers, metrics
 26 | from transformers import BertTokenizer
 27 | 
 28 | sys.path.append(os.path.join(os.path.dirname(__file__), "..."))
 29 | from utils.helper import load_dataframe
 30 | from utils.prepare_data import Dataset
 31 | from utils.model import BiLSTM_Model
 32 | from utils.helper import Config
 33 | from utils.experiment_tracking import MLFlowTracker
 34 | 
 35 | config = Config()
 36 | 
 37 | @dataclass
 38 | class Train_parameters:
 39 |         """
 40 |         Dataclass for holding parameter values for training.
 41 | 
 42 |         Member variables
 43 |         ----------------
 44 |         batch_size: int
 45 |                 Number of samples per gradient update.
 46 |         num_classes: int
 47 |                 Number of output labels or classes.
 48 |         embedding_dim: int
 49 |                 Number of output embedding vectors for embedding layer.
 50 |         sequence_length: int
 51 |                 Size of each input sequence
 52 |         num_epochs: int
 53 |                 Number of epochs to train the model.
 54 |         """
 55 |         batch_size: int
 56 |         num_classes: int
 57 |         embedding_dim: int
 58 |         sequence_length: int
 59 |         num_epochs: int
 60 |         learning_rate: float
 61 | 
 62 | @dataclass
 63 | class Model_tracking_parameters:
 64 |         """
 65 |         Dataclass for holding parameter values for model tracking.
 66 | 
 67 |         Member variables
 68 |         ----------------
 69 |         experiment_name: str
 70 |                 Name of experiment to log as MLflow run.
 71 |         mlflow_tracking_uri: str
 72 |                 URI of EC2 instance where MLflow server is hosted.
 73 |         run_name: str
 74 |                 Name of training run pertaining to an experiment
 75 |         experiment: bool
 76 |                 True to create a new experiment, else False.
 77 |         """
 78 |         experiment_name: str
 79 |         mlflow_tracking_uri: str
 80 |         run_name: str
 81 |         experiment: bool
 82 | 
 83 | class Training:
 84 |         def __init__(self, training_args: Train_parameters,
 85 |                     model_tracking_args: Model_tracking_parameters
 86 |                 ):
 87 | 
 88 |                 """
 89 |                 Instance variables
 90 |                 ------------------
 91 |                 training_args: Train_parameters
 92 |                         Instance of Train_parameters
 93 |                 model_tracking_args: Model_tracking_parameters
 94 |                         Instance of Model_tracking_parameters
 95 |                 """
 96 |                 self.training_args = training_args
 97 |                 self.model_tracking_args = model_tracking_args
 98 | 
 99 |         def check_and_set_gpu(self) -> tf.config.LogicalDevice:
100 |                 """
101 |                 Configure and set GPU for model training, else use CPU by default.
102 | 
103 |                 Parameters
104 |                 ----------
105 |                         None
106 | 
107 |                 Returns
108 |                 -------
109 |                        logical_gpu: tf.config.LogicalDevice
110 |                                 List of initialized logical devices.
111 |                 
112 |                 Raises
113 |                 ------
114 |                 RuntimeError: Exception
115 |                         If GPU setting failed during runtime.
116 |                 """
117 |                 try:
118 |                         available_gpu_devices = tf.config.experimental.list_physical_devices("GPU")
119 |                         if len(available_gpu_devices) > 0:
120 |                                 # Since the system has only one GPU, setting it to the first GPU
121 |                                 tf.config.set_visible_devices(available_gpu_devices[0], "GPU")
122 |                                 # Allocating GPU memory based on the runtime
123 |                                 tf.config.experimental.set_memory_growth(available_gpu_devices[0], True)
124 |                                 logical_gpu = tf.config.list_logical_devices("GPU")
125 | 
126 |                 except Exception as exc:
127 |                         raise RuntimeError("Runtime failed in GPU setting. Please check and try again!!") from exc
128 | 
129 |                 return logical_gpu
130 | 
131 |         def train(self) -> None:
132 |                 """
133 |                 Method that initializes and performs model training.
134 | 
135 |                 Parameters
136 |                 ----------
137 |                         None
138 | 
139 |                 Returns
140 |                 -------
141 |                         None
142 |                 """
143 | 
144 |                 # Configure physical GPU to logical device in the runtime and assert whether it's successful
145 |                 gpu = self.check_and_set_gpu()
146 |                 assert len(gpu) > 0
147 | 
148 |                 tracker = MLFlowTracker(experiment_name = self.model_tracking_args.experiment_name,
149 |                                         tracking_uri = self.model_tracking_args.mlflow_tracking_uri,
150 |                                         run_name = self.model_tracking_args.run_name,
151 |                                         experiment = self.model_tracking_args.experiment)
152 |                 tracker.log()
153 |                 
154 |                 dataframe: pd.DataFrame = load_dataframe("./preprocessed_tweets.parquet")
155 |                 df = dataframe[['cleaned_tweets','labels']].iloc[0:35000].copy()
156 |                 train_dataframe, test_dataframe = train_test_split(df, test_size = 0.25,
157 |                                                                 random_state = 42,
158 |                                                                 stratify = df['labels'])
159 |                 train_dataframe.dropna(inplace = True)
160 |                 test_dataframe.dropna(inplace = True)
161 |                 
162 |                 y_train = to_categorical(train_dataframe['labels'], num_classes = self.training_args.num_classes)
163 |                 y_test = to_categorical(test_dataframe['labels'], num_classes = self.training_args.num_classes)
164 | 
165 |                 # Using the BERT tokenizer to tokenize every input tweets, rather than a normal tokenizer
166 |                 tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
167 |                 train_dataset: tf.data.Dataset.zip = Dataset(tokenizer = tokenizer, dataframe = train_dataframe,
168 |                                                             labels = y_train, batch_size = self.training_args.batch_size,
169 |                                                             max_length = self.training_args.sequence_length,
170 |                                                             train = True).encode_bert_tokens_to_tf_dataset()
171 | 
172 |                 test_dataset: tf.data.Dataset.zip = Dataset(tokenizer = tokenizer, dataframe = test_dataframe,
173 |                                                            labels = y_test, batch_size = self.training_args.batch_size,
174 |                                                            max_length = self.training_args.sequence_length,
175 |                                                            train = True).encode_bert_tokens_to_tf_dataset()
176 |                                         
177 |                 model: Sequential = BiLSTM_Model(
178 |                                 tokenizer.vocab_size,
179 |                                 self.training_args.num_classes,
180 |                                 self.training_args.embedding_dim,
181 |                                 self.training_args.sequence_length).create_model()
182 |                 
183 |                 print("Training started.....")
184 |                 model.compile(
185 |                              loss = losses.CategoricalCrossentropy(),
186 |                              optimizer = optimizers.Adam(
187 |                                                         learning_rate = self.training_args.learning_rate,
188 |                                                         epsilon=1e-08),
189 |                              metrics = [metrics.CategoricalAccuracy('accuracy')]
190 |                         )
191 |                 
192 |                 model.fit(
193 |                         train_dataset,
194 |                         validation_data = test_dataset,
195 |                         epochs = self.training_args.num_epochs,
196 |                         batch_size = self.training_args.batch_size
197 |                         )
198 | 
199 |                 tracker.end()
200 | 
201 | def main() -> None:
202 |         training_parameters_ = Train_parameters(
203 |                                         config["train-parameters"]["batch_size"],
204 |                                         config["train-parameters"]["num_classes"],
205 |                                         config["train-parameters"]["embedding_dim"],
206 |                                         config["train-parameters"]["sequence_length"],
207 |                                         config["train-parameters"]["num_epochs"],
208 |                                         config["train-parameters"]["learning_rate"],
209 |                                         )
210 | 
211 |         model_tracking_parameters_ = Model_tracking_parameters(
212 |                                         config["model-tracking"]["experiment_name"],
213 |                                         config["model-tracking"]["mlflow_tracking_uri"],
214 |                                         config["model-tracking"]["run_name"],
215 |                                         config["model-tracking"]["experiment"]
216 |                                         )
217 |         
218 |         model_training_ = Training(
219 |                                 training_parameters_,
220 |                                 model_tracking_parameters_
221 |                                 )
222 | 
223 |         model_training_.train()
224 | 
225 | if __name__ == "__main__":
226 |     main()


--------------------------------------------------------------------------------
/dependencies/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Base docker image with all required dependencies and secrets mounted
 2 | 
 3 | FROM apache/airflow:2.4.1-python3.9
 4 | 
 5 | COPY ./dependencies/requirements.txt /requirements.txt
 6 | 
 7 | RUN pip install --user --upgrade pip
 8 | 
 9 | RUN pip install -r /requirements.txt
10 | 
11 | # Mounting every docker secrets into the docker image as environment variables, 
12 | # so that they aren't leaked & exposed by layer caching during image build
13 | RUN --mount=type=secret,id=AWS_ACCESS_KEY_ID \
14 |     --mount=type=secret,id=AWS_SECRET_ACCESS_KEY \
15 |     --mount=type=secret,id=REGION \
16 |     --mount=type=secret,id=LOGIN \
17 |     --mount=type=secret,id=PASSWORD \
18 |     --mount=type=secret,id=HOST \
19 |     --mount=type=secret,id=ACCOUNT \
20 |     --mount=type=secret,id=WAREHOUSE \
21 |     --mount=type=secret,id=DATABASE \
22 |     --mount=type=secret,id=SCHEMA \
23 |     export AWS_ACCESS_KEY_ID=$(cat /run/secrets/AWS_ACCESS_KEY_ID) && \
24 |     export AWS_SECRET_ACCESS_KEY=$(cat /run/secrets/AWS_SECRET_ACCESS_KEY) && \
25 |     export REGION=$(cat /run/secrets/REGION) && \
26 |     export LOGIN=$(cat /run/secrets/LOGIN) && \
27 |     export PASSWORD=$(cat /run/secrets/PASSWORD) && \
28 |     export HOST=$(cat /run/secrets/HOST) && \
29 |     export ACCOUNT=$(cat /run/secrets/ACCOUNT) && \
30 |     export WAREHOUSE=$(cat /run/secrets/WAREHOUSE) && \
31 |     export DATABASE=$(cat /run/secrets/DATABASE) && \
32 |     export SCHEMA=$(cat /run/secrets/SCHEMA)


--------------------------------------------------------------------------------
/dependencies/requirements.txt:
--------------------------------------------------------------------------------
 1 | pandas==1.3.5
 2 | nltk==3.7
 3 | textblob===0.17.1
 4 | snscrape==0.4.3.20220106
 5 | tomli==2.0.1
 6 | apache-airflow[amazon]==2.4.2
 7 | transformers==4.24.0
 8 | numpy==1.23.4
 9 | tensorflow==2.10.0
10 | pyOpenSSL==22.1.0
11 | pyarrow==8.0.0
12 | cryptography==38.0.1
13 | snowflake-connector-python==2.9.0
14 | apache-airflow-providers-snowflake==4.0.2
15 | apache-airflow-providers-docker==3.4.0
16 | spacy==3.5.0
17 | mlflow==2.1.1
18 | checklist==0.0.11
19 | 


--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
  1 | # Licensed to the Apache Software Foundation (ASF) under one
  2 | # or more contributor license agreements.  See the NOTICE file
  3 | # distributed with this work for additional information
  4 | # regarding copyright ownership.  The ASF licenses this file
  5 | # to you under the Apache License, Version 2.0 (the
  6 | # "License"); you may not use this file except in compliance
  7 | # with the License.  You may obtain a copy of the License at
  8 | #
  9 | #   http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing,
 12 | # software distributed under the License is distributed on an
 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 14 | # KIND, either express or implied.  See the License for the
 15 | # specific language governing permissions and limitations
 16 | # under the License.
 17 | #
 18 | 
 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
 20 | #
 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment.
 22 | #
 23 | # This configuration supports basic configuration using environment variables or an .env file
 24 | # The following variables are supported:
 25 | #
 26 | # AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
 27 | #                                Default: apache/airflow:2.4.1
 28 | # AIRFLOW_UID                  - User ID in Airflow containers
 29 | #                                Default: 50000
 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
 31 | #
 32 | # _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
 33 | #                                Default: airflow
 34 | # _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
 35 | #                                Default: airflow
 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
 37 | #                                Default: ''
 38 | #
 39 | # Feel free to modify this file to suit your needs.
 40 | ---
 41 | version: '3'
 42 | x-airflow-common:
 43 |   &airflow-common
 44 |   # In order to add custom dependencies or upgrade provider packages you can use your extended image.
 45 |   # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
 46 |   # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
 47 |   image: ${AIRFLOW_IMAGE_NAME:-extending_airflow:latest}
 48 |   # build: .
 49 |   environment:
 50 |     &airflow-common-env
 51 |     AIRFLOW__CORE__EXECUTOR: CeleryExecutor
 52 |     AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
 53 |     # For backward compatibility, with Airflow <2.3
 54 |     AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
 55 |     AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
 56 |     AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
 57 |     AIRFLOW__CORE__FERNET_KEY: ''
 58 |     AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
 59 |     AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
 60 |     AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
 61 |     AWS_ACCESS_KEY_ID: /run/secrets/aws_access_key_id
 62 |     AWS_SECRET_ACCESS_KEY: /run/secrets/aws_secret_access_key
 63 |     REGION_NAME: /run/secrets/region_name
 64 |     LOGIN: /run/secrets/login
 65 |     PASSWORD: /run/secrets/password
 66 |     HOST: /run/secrets/host
 67 |     ACCOUNT: /run/secrets/account
 68 |     WAREHOUSE: /run/secrets/warehouse
 69 |     DATABASE: /run/secrets/database
 70 |     SCHEMA: /run/secrets/schema
 71 |   volumes:
 72 |     - ./dags:/opt/airflow/dags
 73 |     - ./logs:/opt/airflow/logs
 74 |     - ./plugins:/opt/airflow/plugins
 75 |     - ./config:/opt/airflow/config
 76 |   user: "${AIRFLOW_UID:-50000}:0"
 77 |   depends_on:
 78 |     &airflow-common-depends-on
 79 |     redis:
 80 |       condition: service_healthy
 81 |     postgres:
 82 |       condition: service_healthy
 83 | 
 84 | services:
 85 |   postgres:
 86 |     image: postgres:13
 87 |     environment:
 88 |       POSTGRES_USER: airflow
 89 |       POSTGRES_PASSWORD: airflow
 90 |       POSTGRES_DB: airflow
 91 |     volumes:
 92 |       - postgres-db-volume:/var/lib/postgresql/data
 93 |     healthcheck:
 94 |       test: ["CMD", "pg_isready", "-U", "airflow"]
 95 |       interval: 5s
 96 |       retries: 5
 97 |     restart: always
 98 | 
 99 |   redis:
100 |     image: redis:latest
101 |     expose:
102 |       - 6379
103 |     healthcheck:
104 |       test: ["CMD", "redis-cli", "ping"]
105 |       interval: 5s
106 |       timeout: 30s
107 |       retries: 50
108 |     restart: always
109 | 
110 |   airflow-webserver:
111 |     <<: *airflow-common
112 |     command: webserver
113 |     ports:
114 |       - 8080:8080
115 |     healthcheck:
116 |       test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
117 |       interval: 10s
118 |       timeout: 10s
119 |       retries: 5
120 |     restart: always
121 |     depends_on:
122 |       <<: *airflow-common-depends-on
123 |       airflow-init:
124 |         condition: service_completed_successfully
125 | 
126 |   airflow-scheduler:
127 |     <<: *airflow-common
128 |     command: scheduler
129 |     healthcheck:
130 |       test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
131 |       interval: 10s
132 |       timeout: 10s
133 |       retries: 5
134 |     restart: always
135 |     depends_on:
136 |       <<: *airflow-common-depends-on
137 |       airflow-init:
138 |         condition: service_completed_successfully
139 | 
140 |   airflow-worker:
141 |     <<: *airflow-common
142 |     command: celery worker
143 |     healthcheck:
144 |       test:
145 |         - "CMD-SHELL"
146 |         - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
147 |       interval: 10s
148 |       timeout: 10s
149 |       retries: 5
150 |     environment:
151 |       <<: *airflow-common-env
152 |       # Required to handle warm shutdown of the celery workers properly
153 |       # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
154 |       DUMB_INIT_SETSID: "0"
155 |     restart: always
156 |     depends_on:
157 |       <<: *airflow-common-depends-on
158 |       airflow-init:
159 |         condition: service_completed_successfully
160 | 
161 |   airflow-triggerer:
162 |     <<: *airflow-common
163 |     command: triggerer
164 |     healthcheck:
165 |       test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
166 |       interval: 10s
167 |       timeout: 10s
168 |       retries: 5
169 |     restart: always
170 |     depends_on:
171 |       <<: *airflow-common-depends-on
172 |       airflow-init:
173 |         condition: service_completed_successfully
174 | 
175 |   airflow-init:
176 |     <<: *airflow-common
177 |     entrypoint: /bin/bash
178 |     # yamllint disable rule:line-length
179 |     command:
180 |       - -c
181 |       - |
182 |         function ver() {
183 |           printf "%04d%04d%04d%04d" $${1//./ }
184 |         }
185 |         airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version)
186 |         airflow_version_comparable=$$(ver $${airflow_version})
187 |         min_airflow_version=2.2.0
188 |         min_airflow_version_comparable=$$(ver $${min_airflow_version})
189 |         if (( airflow_version_comparable < min_airflow_version_comparable )); then
190 |           echo
191 |           echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m"
192 |           echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!"
193 |           echo
194 |           exit 1
195 |         fi
196 |         if [[ -z "${AIRFLOW_UID}" ]]; then
197 |           echo
198 |           echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
199 |           echo "If you are on Linux, you SHOULD follow the instructions below to set "
200 |           echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
201 |           echo "For other operating systems you can get rid of the warning with manually created .env file:"
202 |           echo "    See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
203 |           echo
204 |         fi
205 |         one_meg=1048576
206 |         mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
207 |         cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
208 |         disk_available=$$(df / | tail -1 | awk '{print $$4}')
209 |         warning_resources="false"
210 |         if (( mem_available < 4000 )) ; then
211 |           echo
212 |           echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
213 |           echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
214 |           echo
215 |           warning_resources="true"
216 |         fi
217 |         if (( cpus_available < 2 )); then
218 |           echo
219 |           echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
220 |           echo "At least 2 CPUs recommended. You have $${cpus_available}"
221 |           echo
222 |           warning_resources="true"
223 |         fi
224 |         if (( disk_available < one_meg * 10 )); then
225 |           echo
226 |           echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
227 |           echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
228 |           echo
229 |           warning_resources="true"
230 |         fi
231 |         if [[ $${warning_resources} == "true" ]]; then
232 |           echo
233 |           echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
234 |           echo "Please follow the instructions to increase amount of resources available:"
235 |           echo "   https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
236 |           echo
237 |         fi
238 |         mkdir -p /sources/logs /sources/dags /sources/plugins
239 |         chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
240 |         exec /entrypoint airflow version
241 |     # yamllint enable rule:line-length
242 |     environment:
243 |       <<: *airflow-common-env
244 |       _AIRFLOW_DB_UPGRADE: 'true'
245 |       _AIRFLOW_WWW_USER_CREATE: 'true'
246 |       _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
247 |       _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
248 |       _PIP_ADDITIONAL_REQUIREMENTS: ''
249 |     user: "0:0"
250 |     volumes:
251 |       - .:/sources
252 | 
253 |   airflow-cli:
254 |     <<: *airflow-common
255 |     profiles:
256 |       - debug
257 |     environment:
258 |       <<: *airflow-common-env
259 |       CONNECTION_CHECK_MAX_COUNT: "0"
260 |     # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
261 |     command:
262 |       - bash
263 |       - -c
264 |       - airflow
265 | 
266 |   # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up
267 |   # or by explicitly targeted on the command line e.g. docker-compose up flower.
268 |   # See: https://docs.docker.com/compose/profiles/
269 |   flower:
270 |     <<: *airflow-common
271 |     command: celery flower
272 |     profiles:
273 |       - flower
274 |     ports:
275 |       - 5555:5555
276 |     healthcheck:
277 |       test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
278 |       interval: 10s
279 |       timeout: 10s
280 |       retries: 5
281 |     restart: always
282 |     depends_on:
283 |       <<: *airflow-common-depends-on
284 |       airflow-init:
285 |         condition: service_completed_successfully
286 | 
287 | volumes:
288 |   postgres-db-volume:
289 | 


--------------------------------------------------------------------------------
/images/Sagemaker_endpoint.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/Sagemaker_endpoint.jpg


--------------------------------------------------------------------------------
/images/architecture_diagram.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/architecture_diagram.jpeg


--------------------------------------------------------------------------------
/images/ecr_image.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/ecr_image.PNG


--------------------------------------------------------------------------------
/images/etl_dag.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/etl_dag.PNG


--------------------------------------------------------------------------------
/images/mlflow_exps.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/mlflow_exps.PNG


--------------------------------------------------------------------------------
/images/model_dag.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_dag.PNG


--------------------------------------------------------------------------------
/images/model_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_plot.png


--------------------------------------------------------------------------------
/images/model_registry_latest1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_registry_latest1.PNG


--------------------------------------------------------------------------------
/images/model_registry_latest2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_registry_latest2.PNG


--------------------------------------------------------------------------------
/images/model_registry_org.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_registry_org.PNG


--------------------------------------------------------------------------------
/scripts/behavioral_test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | @author: Jithin Sasikumar
  3 | 
  4 | Module to define and perform behavioral testing of sentiment analysis model. It is based on
  5 | the paper [1] that proposes three different types of tests but only two tests are performed
  6 | in this project namely -
  7 |     - Minimum Functionality test (MFT)
  8 |     - Invariance test (INV)
  9 | 
 10 | Note
 11 | ----
 12 |     Model testing differs from model evaluation.
 13 | 
 14 | References
 15 | ----------
 16 | [1] Beyond Accuracy: Behavioral Testing of NLP models with CheckList
 17 | [2] https://github.com/marcotcr/checklist
 18 | """
 19 | 
 20 | import os
 21 | import spacy
 22 | import numpy as np
 23 | import pandas as pd
 24 | import tensorflow as tf
 25 | from checklist.perturb import Perturb
 26 | from keras.models import Sequential
 27 | from sklearn.metrics import accuracy_score
 28 | nlp = spacy.load('en_core_web_sm')
 29 | 
 30 | 
 31 | def min_functionality_test(dataframe: pd.DataFrame) -> pd.DataFrame:
 32 |     """
 33 |     Function to perturb test data which is suitable to perform MFT. A specific behavior (or)
 34 |     capability of the model is tested. In this case, the specific behavior to be tested
 35 |     is `negation` (i.e.) how well the model handles negated inputs.
 36 | 
 37 |     More detailed information can be found in the README.md
 38 | 
 39 |     Parameters
 40 |     ----------
 41 |     dataframe: pd.DataFrame
 42 |         Test dataframe consisting of original text.
 43 | 
 44 |     Returns
 45 |     -------
 46 |     negated_dataframe: pd.DataFrame
 47 |         Dataframe after negating original texts with their corresponding labels.
 48 |     """
 49 | 
 50 |     original_text: list = dataframe["sample_text"].tolist()
 51 |     true_labels: list = dataframe["labels"].tolist()
 52 |     piped_text  = list(nlp.pipe(original_text))
 53 | 
 54 |     # Adding negation to original text using `checklist` package
 55 |     perturbed_data = Perturb.perturb(piped_text, Perturb.add_negation)
 56 |     negated_texts: list = [text[1] for text in perturbed_data.data]
 57 | 
 58 |     negated_dataframe = pd.DataFrame(
 59 |                                     list(zip(negated_texts, true_labels)),
 60 |                                     columns = ["negated_text", "labels"]
 61 |                                     )
 62 | 
 63 |     return negated_dataframe
 64 | 
 65 | def invariance_test(text: str) -> str:
 66 |     """
 67 |     Function to perturb test data which is suitable to perform invariance test.
 68 |     The test data is perturbed in a way that their context are preserved. Despite
 69 |     perturbing the data, the model is expected to generalize well and predict the
 70 |     same labels pertaining to the actual test data.
 71 | 
 72 |     Two perturbations are added namely:
 73 |         - Adding typos to the actual test data.
 74 |         - Expanding contractions to the same.
 75 | 
 76 |     Parameters
 77 |     ----------
 78 |     text: str
 79 |         Input text from actual test data.
 80 | 
 81 |     Returns
 82 |     -------
 83 |     perturbed_text: str
 84 |         Resulting text after applying two perturbations.
 85 |     """
 86 | 
 87 |     text_with_typo = str(Perturb.add_typos(text))
 88 |     perturbed_text = Perturb.expand_contractions(text_with_typo)
 89 |     return perturbed_text
 90 | 
 91 | 
 92 | def run(test_name: str, model: Sequential,
 93 |         test_dataset: tf.data.Dataset.zip,
 94 |         dataframe: pd.DataFrame) -> float:
 95 |     """
 96 |     Function to perform specified behavioral test using perturbed data.
 97 | 
 98 |     Parameters
 99 |     ----------
100 |     test_name: str
101 |         Name of test (MFT or invariance).
102 |     model: Sequential
103 |         Trained (or) productionalized model pulled from model registry
104 |         in EC2 instance.
105 |     test_dataset: tf.data.Dataset.zip
106 |         Perturbed dataset transformed to tensorflow dataset format.
107 |     dataframe: pd.DataFrame
108 |         Dataframe where test results will be written and saved at the
109 |         end as CSV for analysis and benchmarking.
110 | 
111 |     Returns
112 |     -------
113 |     test_accuracy: float
114 |     """
115 |     try:
116 |         for text, _ in test_dataset.take(1):
117 |             text_ = text.numpy()
118 | 
119 |     except Exception:
120 |         print(f"Exception occurred when trying to access {test_dataset}. Please check!!")
121 |     
122 |     else:
123 |         predicted_probabilities = model.predict(text_)
124 |         predicted_labels = np.argmax(
125 |                                     np.array(predicted_probabilities),
126 |                                     axis = 1
127 |                                     )
128 | 
129 |         dataframe["predicted_labels"] = predicted_labels
130 |         dataframe["predicted_probabilities"] = predicted_probabilities.tolist()
131 | 
132 |         # Save test results as CSv
133 |         dataframe_path = os.path.join(os.getcwd(), "test_results")
134 |         dataframe.to_csv(f"{dataframe_path}/{test_name}_test_results.csv", index = False)
135 | 
136 |         test_accuracy = accuracy_score(
137 |                                         y_true = dataframe['labels'].tolist(),
138 |                                         y_pred = dataframe['predicted_labels'].tolist()
139 |                                     )
140 | 
141 |         return test_accuracy


--------------------------------------------------------------------------------
/scripts/deploy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """
 4 | @author: Jithin Sasikumar
 5 | 
 6 | Script to deploy productionalized model into AWS Sagemaker. The production model
 7 | from MLflow model registry in EC2 instance is packaged into a docker image as a
 8 | deployable model artifact and pushed into Amazon ECR. The deployable image from
 9 | AWS ECR is then deployed into AWS Sagemaker instance which creates an endpoint that
10 | can be used to communicate with the model for inferencing.
11 | """
12 | 
13 | import os
14 | import sys
15 | import mlflow
16 | from mlflow import sagemaker
17 | 
18 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
19 | from utils.helper import Config
20 | 
21 | config = Config()
22 | 
23 | mlflow.set_tracking_uri(config["model-tracking"]["mlflow_tracking_uri"])
24 | 
25 | #Name of the resulting endpoint
26 | app_name = config["model-deploy"]["endpoint_name"]
27 | 
28 | # Location of mlflow production model to be deployed from remote server
29 | model_name = config["model-registry"]["model_name"]
30 | model_uri = f"models:/{model_name}/production"
31 | 
32 | # Docker image that is built & pushed to AWS ECR repository as deployable model artifact
33 | docker_image_url = os.environ["IMAGE_URI"]
34 | 
35 | # ARN role of IAM user
36 | role = os.environ["ARN_ROLE"]
37 | 
38 | # Default region of AWS services
39 | region = os.environ["REGION"]
40 | 
41 | # Deploying the docker image containing mlflow production model & dependencies from AWS ECR to Sagemaker instance
42 | sagemaker._deploy(
43 |                 mode = 'create',
44 |                 app_name = app_name,
45 |                 model_uri = model_uri,
46 |                 image_url = docker_image_url,
47 |                 execution_role_arn = role,
48 |                 instance_type = 'ml.m5.xlarge',
49 |                 instance_count = 1,
50 |                 region_name = region
51 |             )


--------------------------------------------------------------------------------
/scripts/stage_model_to_production.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """
  4 | @author: Jithin Sasikumar
  5 | 
  6 | Script to productionalize the best model. The models (latest, production) from the
  7 | MLflow model registry in EC2 instance are pulled and benchmarked by means of
  8 | behavioral testing and evaluation. As a result, the best performing model is
  9 | pushed to production and other is archived, so that the production model can be
 10 | packaged as a deployable artifact and deployed to AWS Sagemaker instance.
 11 | """
 12 | 
 13 | import os
 14 | import mlflow
 15 | import sys
 16 | import pandas as pd
 17 | import tensorflow as tf
 18 | import behavioral_test
 19 | from dataclasses import dataclass, field
 20 | from keras.utils import to_categorical
 21 | from transformers import BertTokenizer
 22 | 
 23 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 24 | from utils.helper import Config, load_dataframe
 25 | from utils.prepare_data import Dataset
 26 | 
 27 | config = Config()
 28 | 
 29 | @dataclass
 30 | class Productionalize:
 31 |     """
 32 |     Benchmark and push latest model to production based on testing and evaluation.
 33 |     """
 34 |     tracking_uri: str
 35 |     test_data: str = "./test_data.parquet"
 36 |     client: mlflow.MlflowClient = None
 37 |     test_dataframe: pd.DataFrame = None
 38 |     model_name: str = ""
 39 |     batch_size: int = 64
 40 |     sequence_length: int = 256
 41 |     num_classes: int = 3
 42 |     latest_version: int = 3
 43 |     filter_string = "name LIKE 'sentiment%'"
 44 | 
 45 |     def __post_init__(self) -> None:
 46 |         """
 47 |         Dunder method to set mlflow_tracking_uri and values to some instance variables.
 48 | 
 49 |         Returns
 50 |         -------
 51 |             None
 52 | 
 53 |         Raises
 54 |         ------
 55 |         ConnectionError: Exception
 56 |             If mlflow_tracking_uri is invalid.
 57 |         """
 58 |         try:
 59 |             mlflow.set_tracking_uri(self.tracking_uri)
 60 |         
 61 |         except ConnectionError:
 62 |             print(f"Cannot connect to {self.tracking_uri}. Please check and try again!!!")
 63 | 
 64 |         else:
 65 |             self.client = mlflow.MlflowClient()
 66 |             self.latest_version = self.client.get_latest_versions(name = self.model_name)[0].version
 67 |             self.test_dataframe = load_dataframe(self.test_data)
 68 | 
 69 |     def get_all_registered_models(self) -> None:
 70 |         """
 71 |         Method to search and display all registered models from model registry in EC2 instance based on
 72 |         given filter.
 73 | 
 74 |         Parameters
 75 |         ----------
 76 |             None
 77 |         
 78 |         Returns
 79 |         -------
 80 |             None
 81 |         """
 82 |         # Searching all models with names starting with sentiment
 83 |         for model in self.client.search_registered_models(filter_string = self.filter_string):
 84 |             for model_version in model.latest_versions:
 85 |                 print(f"name = {model_version.name}, version = {model_version.version}, stage = {model_version.current_stage}, run_id = {model_version.run_id}")
 86 | 
 87 |     def load_models(self) -> tf.function:
 88 |         """
 89 |         Method to pull and load tensorflow models from model registry to be used for benchmarking.
 90 |         It loads two models namely:
 91 |                 - Latest model => Trained model added to the model registry with latest version.
 92 |                 - Production model => Model which is already in production stage.
 93 | 
 94 |         Parameters
 95 |         ----------
 96 |             None
 97 | 
 98 |         Returns
 99 |         -------
100 |         latest_model, production_model: tf.function
101 |             Callable TensorFlow graph that takes inputs and returns inferences.
102 |         """
103 | 
104 |         latest_model: tf.function = mlflow.tensorflow.load_model(
105 |                                                             model_uri = f"models:/{self.model_name}/{self.latest_version}"
106 |                                                             )
107 | 
108 |         production_model: tf.function = mlflow.tensorflow.load_model(
109 |                                                             model_uri = f"models:/{self.model_name}/production"
110 |                                                             )
111 | 
112 |         return latest_model, production_model
113 | 
114 |     def transform_data(self, dataframe: pd.DataFrame,
115 |                       col_name: str = "cleaned_tweets") -> tf.data.Dataset.zip:
116 |         """
117 |         Method that transform dataframe into tensorflow dataset using BERT tokenizer. It wraps
118 |         Dataset class from `prepare_data.py` module.
119 | 
120 |         Parameters
121 |         ----------
122 |         dataframe: pd.DataFrame
123 |             Input dataframe
124 |         col_name: str = "cleaned_tweets"
125 |             Name of column containing input texts. Defaults to "cleaned_tweets".
126 | 
127 |         Returns
128 |         -------
129 |         dataset: tf.data.Dataset.zip
130 |             Tensorflow dataset after batching.
131 |         """
132 | 
133 |         y_test = to_categorical(dataframe['labels'], self.num_classes)
134 |         tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
135 |         dataset = Dataset(tokenizer = tokenizer, dataframe = dataframe,
136 |                           labels = y_test, batch_size = self.batch_size,
137 |                           max_length = self.sequence_length,
138 |                           col_name = col_name).encode_bert_tokens_to_tf_dataset()
139 | 
140 |         return dataset
141 | 
142 |     def benchmark_models(self) -> tuple[tuple[float], tuple[float]]:
143 |         """
144 |         Method to benchmark the loaded models from model registry to productionalize them.
145 |         The benchmarking is done by performing behavioral testing of loaded models and
146 |         evaluating them.
147 | 
148 |         Parameters
149 |         ----------
150 |             None
151 | 
152 |         Returns
153 |         -------
154 |         latest_model_accuracies, production_model_accuracies: tuple(tuple[float], tuple[float])
155 |             Resulting accuracies from testing and evaluation with perturbed and test data
156 |             respectively.
157 |         """
158 | 
159 |         latest_model, production_model = self.load_models()
160 | 
161 |         # Minimum Functionality test
162 |         sample_mft_dataframe = load_dataframe("./scripts/test_data/sample_test_data_for_mft.parquet")
163 |         negated_dataframe = behavioral_test.min_functionality_test(sample_mft_dataframe)
164 |         perturbed_dataset_mft = self.transform_data(dataframe = negated_dataframe, col_name = "negated_text")
165 |         accuracy_latest_model_mft = behavioral_test.run(test_name = "MFT_latest", model = latest_model,
166 |                                                         test_dataset = perturbed_dataset_mft, dataframe = negated_dataframe)
167 |         accuracy_production_model_mft = behavioral_test.run(test_name = "MFT_production", model = production_model,
168 |                                                         test_dataset = perturbed_dataset_mft, dataframe = negated_dataframe)
169 | 
170 |         # Invariance test (Inv)
171 |         perturbed_dataframe_inv = self.test_dataframe.tail(100)
172 |         perturbed_dataframe_inv["cleaned_tweets"] = perturbed_dataframe_inv["cleaned_tweets"].apply(
173 |                                                                             lambda text: behavioral_test.invariance_test(text)
174 |                                                                             )
175 |         perturbed_dataset_inv = self.transform_data(dataframe = perturbed_dataframe_inv)
176 |         accuracy_latest_model_inv = behavioral_test.run(test_name = "Invariance_latest", model = latest_model,
177 |                                                         test_dataset = perturbed_dataset_inv, dataframe = perturbed_dataframe_inv)
178 |         accuracy_production_model_inv = behavioral_test.run(test_name = "Invariance_production", model = production_model,
179 |                                                         test_dataset = perturbed_dataset_inv, dataframe = perturbed_dataframe_inv)
180 | 
181 |         # Model evaluation using full test data
182 |         test_dataset = self.transform_data(dataframe = self.test_dataframe)
183 |         latest_model_score = latest_model.evaluate(test_dataset)
184 |         production_model_score = production_model.evaluate(test_dataset)
185 | 
186 |         # Wrap results in the tuple
187 |         latest_model_accuracies = (accuracy_latest_model_mft, accuracy_latest_model_inv, latest_model_score[1])
188 |         production_model_accuracies = (accuracy_production_model_mft, accuracy_production_model_inv, production_model_score[1])
189 | 
190 |         return latest_model_accuracies, production_model_accuracies
191 | 
192 |     def push_new_model_to_production(self, latest_model_accuracies: tuple[float],
193 |                                     production_model_accuracies: tuple[float]) -> bool:
194 |         """
195 |         Method to push the latest-best model to production stage based on
196 |         testing and evaluation metrics.
197 | 
198 |         Parameters
199 |         ----------
200 |         latest_model_accuracies: tuple[float]
201 |             Resulting accuracies from testing and evaluation of latest model.
202 |         production_model_accuracies: tuple[float]
203 |             Resulting accuracies from testing and evaluation of production model.
204 | 
205 |         Returns
206 |         -------
207 |         success: bool
208 |             True if latest model is pushed to production, else False.
209 |         """
210 | 
211 |         print(f"Latest model accuracies: {latest_model_accuracies},\nProduction model accuracies: {production_model_accuracies}")
212 | 
213 |         if latest_model_accuracies > production_model_accuracies:
214 |             self.client.transition_model_version_stage(
215 |                                                     name = self.model_name,
216 |                                                     version = self.latest_version,
217 |                                                     stage = "Production")
218 | 
219 |             print("Transitioned latest model to production!!")
220 |             success = True
221 | 
222 |         else:
223 |             print("Cannot transition the model stage. Latest model cannot outperform production model in all conducted tests!!!")
224 |             success = False
225 | 
226 |         return success
227 | 
228 | def main() -> None:
229 |     productionalize_ = Productionalize(tracking_uri = config["model-tracking"]["mlflow_tracking_uri"],
230 |                                     test_data = config["files"]["test_data"],
231 |                                     model_name = config["model-registry"]["model_name"],
232 |                                     batch_size = config["train-parameters"]["batch_size"],
233 |                                     sequence_length = config["train-parameters"]["sequence_length"]
234 |                                     )
235 | 
236 |     accuracy_latest_model, accuracy_production_model = productionalize_.benchmark_models()
237 | 
238 |     success_ = productionalize_.push_new_model_to_production(accuracy_latest_model, accuracy_production_model)
239 | 
240 |     if success_:
241 |         productionalize_.get_all_registered_models()
242 | 
243 | if __name__ == "__main__":
244 |     main()


--------------------------------------------------------------------------------
/scripts/test_data/sample_test_data_for_mft.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/scripts/test_data/sample_test_data_for_mft.parquet


--------------------------------------------------------------------------------
/scripts/test_data/test_data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/scripts/test_data/test_data.parquet


--------------------------------------------------------------------------------
/test_results/Invariance_latest_test_results.csv:
--------------------------------------------------------------------------------
  1 | cleaned_tweets,labels,predicted_labels,predicted_probabilities
  2 | in new  market guide on  ai trust  risk and security management   modelops is one of the key pillars in ai trust  amp  risk management    is offering a complimentaryc opy of the guide         enterpriseai  modelopco  ml  aistrategy  mlops ,0,0,"[0.9975749254226685, 0.001199319725856185, 0.001225676154717803]"
  3 |  mlops is the process of operationalizing your  mahcinelearning models  know more about managing  machinelearningoperationalization   amp  its  impact on business      mlmanagement  artificialintelligence  softwaredevelopment  impressico  digitaltransformation ,2,2,"[0.008079132065176964, 3.538187957019545e-05, 0.9918855428695679]"
  4 | it s friday so it s time to share some awesomeness  here s   k d running  triton in a end to en d training   serving  scenario  benchmarking included     datascience  mlops  machinelearning,0,2,"[0.0011978754773736, 1.166270749308751e-06, 0.9988009929656982]"
  5 |  infographic  types of  machinelearning  via    artificialintelligence  deeplearning  ai  digitaltransformation  bigdata  analytics  datascience  pytocrh  python  tensorflow  reactjs  cloudcomputing  datascientist  linux     daysofcode  mlops  modelops  deeplearning ,0,0,"[0.9999049305915833, 4.034348239656538e-05, 5.47533854842186e-05]"
  6 | understanding the  azrue mlops framework  ,0,0,"[0.9993130564689636, 0.00047179561806842685, 0.0002151436056010425]"
  7 | mlop swith kubernetes  rabbitmq and fastapi ,0,0,"[0.999871015548706, 6.130681867944077e-05, 6.786939047742635e-05]"
  8 | mlops with kubernetes  rabbitmq and fastapi   analytcis  datascience   bigdata   datascience   datascience  ds   machinelearning,0,0,"[0.9995363354682922, 0.0002508562174625695, 0.00021275135804899037]"
  9 |   days to go  join us at   explainable ai  xai  summit      as we move one step closert o derisking  ai in enterprises  register now    appliedai  xai  mlops ,0,0,"[0.9998440742492676, 8.217216964112595e-05, 7.394433487206697e-05]"
 10 | watch this    minute video lead by our ct o pablo tapia  for an introduction to tuplos  the ml ops platform from    digital  data  automation  ml  development  database  mlops  aiops  bigdata  zerotouch  aiforbusiness  lowcodeplatform   ,0,0,"[0.9990637898445129, 0.000264366390183568, 0.0006718619260936975]"
 11 |  rt   for  ai to make a sizable contribution to a company s bottom line  organizations must scale the technology acrosst he organization   mlops can help  but the  ceo must facilitate it   ,0,0,"[0.9967904090881348, 0.0017071174224838614, 0.001502607250586152]"
 12 | big data analytics  a viable solution to all healthcare problems via  towardsai     macihnelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment via   towardai,0,0,"[0.9996434450149536, 0.00014437125355470926, 0.00021206472592893988]"
 13 | tdatascience  rt   mlops with kubernetes  rabbitmq and fastapi  my new article was published on   thanks    skipper is a simple and flexible open source ml workflow engine  it helps tor un and scale ml services in production     ,0,0,"[0.9962496161460876, 0.002823008457198739, 0.0009273902396671474]"
 14 |   different approaches for train test splitting of a pandas dataframe via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  progarmming  news  research  coding  aidevelopment via   ,0,0,"[0.9991858601570129, 0.0005140349385328591, 0.00030007565510459244]"
 15 | artificial intelligence  ai  newsletter by towards ai     via  towardsai    mw    machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programmign  news  research  coding  aidevelopment  ainews  ainewsletter v ,1,1,"[0.00432670908048749, 0.9956595301628113, 1.3761035916104447e-05]"
 16 | the role of m lops on effective  ai  ,2,2,"[0.0005395612679421902, 2.6852296741708415e-07, 0.9994601011276245]"
 17 | responsible ai widgets provides a collection of model and data exploration and asssesment user interfaces that enable a better understanding of ai systems    mlops,2,2,"[0.002955460688099265, 6.2794038058200385e-06, 0.9970381855964661]"
 18 | mlops with kubernetes  rabbtimq and fastapi   ml  machinelearning  ai  artificialintelligence,0,0,"[0.9995831847190857, 0.00021738286886829883, 0.00019951179274357855]"
 19 | for  ai to make a sizbale contribution to a company s bottom line  organizations must scale the technology across the organization   mlops can help  but the  ceo must facilitate it   ,0,0,"[0.998309314250946, 0.0006102448678575456, 0.0010804523481056094]"
 20 | th estate of ai in       machine learning in production  mlops and data centric ai   artificiallintelligence  machinelearning  data  qacycle ,0,1,"[0.27157410979270935, 0.7209789156913757, 0.007446992211043835]"
 21 | the state of ai in       amchine learning in production  mlops and data centric ai   artificiallintelligence  machinelearning  data  appsunify ,0,1,"[0.17362572252750397, 0.8226094841957092, 0.0037648086436092854]"
 22 |    communityday track about kubernetes deconstructed   aws edition  speaker  ninad pundalik     if you are mlops  amp  devops enthsuiasts  do join the same     acd      awsusergroups  awscommunityday  ,0,0,"[0.9998753070831299, 6.574806320713833e-05, 5.896862057852559e-05]"
 23 | next up on the judging panel for the  mozdyaihackathon is angel rivera  senior developer advocate at    angel is an experienced  hackathon mentor and judge  and we re so excited to have him on our panel       ai  devlife  coding  devops  mlops ,2,2,"[0.2896493077278137, 0.01047223899513483, 0.6998785138130188]"
 24 | businesses in apac that invest in  customerexperience are becoming pandemic proof   covidburnout  cx  custexp  custserv  infinitejourneys  rox  retrust  ex  hcd  hcxd  designthinking  servdes  ai  lifejourneys  momentsoflife  momentsthatmatter  mlops,2,2,"[0.0023258232977241278, 4.154785983701004e-06, 0.9976699948310852]"
 25 |   communityday track about explainable ai with amazon sagemaker clarify by    sarbani maiti     if you are mlops enthusiasts  do join the same     acd      awsusergroups  aswcommunityday ,0,0,"[0.9995389580726624, 0.0001459317863918841, 0.000315043464070186]"
 26 | what makes an optimal  customerexperience in          cx  custserv  custexp  designthinking  jtbd  innovation  ai  mlop s devops  purpose  retrust  infinitejourneys  rox  experienceequity,0,0,"[0.947429895401001, 0.0027520316652953625, 0.049817971885204315]"
 27 |  like mlops  kdiops takes a village,0,0,"[0.999565064907074, 0.00020490327733568847, 0.00022997547057457268]"
 28 | agile  mindset needed in technology and business   innovation  strategy   machinelearning  datascience  pytohn  ai     daysofcode   iot  flutter  javascript  serverless  womenintech  cybersecurity  technology   womenwhocode  bigdata  deeplearning  data  mlops  rstats ,2,2,"[0.003155388403683901, 8.757564501138404e-06, 0.9968358874320984]"
 29 |  the rol eof mlops on effective ai   ,2,2,"[0.0008684382773935795, 6.541851007568766e-07, 0.9991308450698853]"
 30 | for  ai to make a sizable contribution to a company s bottom line  organizations must scale the technology across the organization  m lops can help  but the  ceo must facilitate it   ,0,0,"[0.9841560125350952, 0.006479825358837843, 0.009364011697471142]"
 31 |        is there a way to compare these wit he g  git    dvc branches   mlops  modelops ,0,0,"[0.7543706893920898, 0.011338168755173683, 0.23429104685783386]"
 32 | a copmlete mlops toolbox by martin carmona ,0,0,"[0.9985236525535583, 0.000729620922356844, 0.000746635312680155]"
 33 | datatron introduces new features to mlops and ai governance solution   prnewswire ,0,0,"[0.9990226626396179, 0.0004840958572458476, 0.0004931276198476553]"
 34 | enusre machine learning success through mlops  ,2,2,"[0.4778515100479126, 0.0053591011092066765, 0.5167893171310425]"
 35 | datatron introduces new features to mlops and ai governance solution   prnewswrie ,0,0,"[0.9990211129188538, 0.0004831781843677163, 0.0004956190241500735]"
 36 | i m be giving a talk at the  conference  only onew eek away   get your tickets now   towards cloud native distributed machine learning pipelines at scale     machinelearning  python  datascience  mlops  devops  cloudnative  kubernetes,0,0,"[0.9994450807571411, 0.00018101614841725677, 0.00037393771344795823]"
 37 | prepare yourself for success with a strong foundation in machine learning essentials  including  mlops  securing  lm environments  and training ml models at scale  sign up for free today   ,2,2,"[0.000749451108276844, 4.743877184409939e-07, 0.999250054359436]"
 38 | the latest update for  algorithmia includes  struggling with  machinelearning  you re not alone  and   report  a comprehensive guide for machine learning governance in th eenterprise      mlops  ai  analytics ,2,2,"[0.0007890466367825866, 6.443226538976887e-07, 0.9992102980613708]"
 39 | big data analytics  a viable solution to all healthcare problems via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  reesarch  coding  aidevelopment,0,0,"[0.9996389150619507, 0.0001492551527917385, 0.00021183474746067077]"
 40 |  paper recommendation  this paper explains why deep learning models such as bert  clip  gpt and dall  earen t just new machine learning models  but what they are now calling  foundation models   mlops  datascientist  machinelearning  foundationmodels  standfordai  jai ,0,0,"[0.9992578029632568, 0.000252933386946097, 0.0004893361474387348]"
 41 | the rloe of mlops on effective ai by carl w handlin wallace ,2,2,"[0.02761387638747692, 0.0001762305764714256, 0.9722099304199219]"
 42 | build new skills in  ml   nlp  mlops  and much more whatever your skill level with     hands on training sessions and expert led workshops at  odscwest this november  register now     off ends soon    datascience  ai  machinelearning  nlp ,2,2,"[0.013526243157684803, 7.89159385021776e-05, 0.9863947629928589]"
 43 |  odscwest      will host some of the best and brightest minds  in  ml  dl   mlops  and more  don t miss this chance to learn from the leading experts in your field  register now     of fends soon   ,2,2,"[0.0006112701958045363, 3.289638357273361e-07, 0.9993883967399597]"
 44 | our upcoming training session on practical mlops will address some of the challenges and questoins that you might face while building out your organization s mlops   datascoemce  mlops ,0,0,"[0.9987800121307373, 0.000426615122705698, 0.0007933723973110318]"
 45 | thrilled to kick off our product blog series highlighting the new features and enhancements in the verta platform    first up  the all important capability of managing access across different types of users and teams  and supporting collaborative ai ml development   mlop s verta ,2,2,"[0.046468961983919144, 0.00031164908432401717, 0.9532193541526794]"
 46 |       key finding      operations  organizations that document and neforce mlops processes are twice as likely to achieve their goals to a high degree  they are also nearly  x more likely to be highly prepared for ai related risks,0,0,"[0.9858360886573792, 0.003545548999682069, 0.010618377476930618]"
 47 |      we re looking forward to participating in the fireside chat at  xaisummit   next week  wednesday  co founder  will have a hcat with fellow panelists from    and  on mlops ecosystems     free registration  ,0,0,"[0.9503957629203796, 0.006481673568487167, 0.043122585862874985]"
 48 | all this talk about mlops but what i really struggle wtih is kidops   ,0,0,"[0.9997024536132812, 0.0001395035651512444, 0.00015802186680957675]"
 49 | reasons organizations must invest in data enginereing and mlops talents   pcquest ,0,0,"[0.9994622468948364, 0.0002567728515714407, 0.00028102879878133535]"
 50 | from  insights  gt  gt  see how  is a key  modelops vendor    see why         machinelearning  bigdata  ai  enterpriseai  datascience  mlops  modelopco  modelgovernance  modelriskmanagement  datascientists  aistrategy ,0,0,"[0.9983953833580017, 0.0009175522718578577, 0.0006871342775411904]"
 51 |   different approachse for train test splitting of a pandas dataframe via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment,0,0,"[0.999395489692688, 0.000347074877936393, 0.0002574461395852268]"
 52 | why do you need a feature store for machine learning   learn this and more on our webcast on kubeflow feast  watch to learn more        mlops  kubeflow  featureengineering  kbueflowfeast ,2,2,"[0.002696745563298464, 5.282335678202799e-06, 0.9972979426383972]"
 53 | a gentle introudction to  mlops by yashaswi nayak in  ,0,0,"[0.9972885251045227, 0.0011645941995084286, 0.001546790124848485]"
 54 | inusrance agents have to be very good at decision making in the insurance industry  with the help of ai  they can make the best decisions and provide enhanced customer service  read this article to know more about it      xpressoai  datascientists  mlops ,2,2,"[0.0008502001292072237, 6.323303978206241e-07, 0.9991491436958313]"
 55 | mlops with kubernetes  rabbitmq and fastapi  my new article was published on   thanks    skipper is a simple and flexible open source ml workflow engine  it helps to run and scale ml services in production    python  kubernetes  read   ,0,0,"[0.9957531094551086, 0.0026684061158448458, 0.001578421681188047]"
 56 |  datascientists and data engineers play a hgue role in  mlops and  devops  with the right  data  both teams work closely to generate the best  application performance  head to the blog now to learn more   via    devops  cloud  programming  aws ,2,2,"[0.0013010645052418113, 1.4494435163214803e-06, 0.9986974000930786]"
 57 | read our full benchmark comparing mlops enterprise readiness soluitons in the cloud from analysts  and jake dolezal       machinelearning  artificialintelligence  deeplearning  ai  bigdata  analytics  datascience  cloudcomputing  mlops ,2,0,"[0.9316837787628174, 0.0422937236726284, 0.02602248638868332]"
 58 | good overview and introduction to  mlops for  datascience by      analytics  iianalytics  tech  technology  artificialintelligence  machinelearning  ml  ai  data  dataanalytics d ataandanalytics,2,2,"[0.0009493071120232344, 9.608435220798128e-07, 0.999049723148346]"
 59 | check this summary of what s new in  kubeflow      plus a breakdown of contributor and chnage stats for each component      machinelearning  datascience  mlops,0,0,"[0.93825763463974, 0.054222866892814636, 0.007519515696913004]"
 60 | iguazio mlops platform now supports amazon fsx for nteapp ontap ,0,0,"[0.9969731569290161, 0.0011448762379586697, 0.0018819262040778995]"
 61 | iguazio mlops platform nwo supports amazon fsx for netapp ontap ,0,0,"[0.9977012872695923, 0.0009127571247518063, 0.001385986339300871]"
 62 |   tools for machine learning serving in mlops    tensorflow serving   torch serve   bentoml   sagemaker   cortex labs   ployagon   aible   seldon   lagorithmia,0,0,"[0.9997541904449463, 0.00011336587340338156, 0.00013248846516944468]"
 63 |  mlops is hot      lots of interesting work happening in the startup ecosystem to help enterprises operationalize ml   join us at  xaisummit   to listen to these amazing speakers from         register today   ,2,2,"[0.0018054584506899118, 2.460224777678377e-06, 0.9981921315193176]"
 64 |  infographic  types of  machinelearning  artificialintelligence  deeplearning  ai  digitaltransformation  bigdata  analytisc  datascience  pytorch  python  tensorflow  reactjs  cloudcomputing  datascientist  linux     daysofcode  mlops  modelops  deeplearning ,0,0,"[0.9999008774757385, 4.2387469875393435e-05, 5.666241850121878e-05]"
 65 |   only   weeks away from our  mlopssalon   we ll be bringing together expertsf rom industry as well as research  and showcase best practices  real world case studies  and a wonderful panel discussion  join us and register here      mlops  machinelearning,2,2,"[0.0026631527580320835, 5.048794264439493e-06, 0.9973317980766296]"
 66 | join this upcoming event to learn more about reproducibility   mlops  memoizatoin  static checking and more  register now    odsc  datascience  ai ,2,2,"[0.0009666963596828282, 8.31110867238749e-07, 0.9990324378013611]"
 67 | from faster model deployment and anomaly detection to adoption of real time data  read how businesse suse  mlops to improve management ,0,0,"[0.9995748400688171, 0.0001555221388116479, 0.00026967705343849957]"
 68 | are we heading towards a new wave of mlops tool evoultion  i think so  here is a small write up on our thought process    mlops  netbook  mlinfraops  datascience  ,1,1,"[0.08040372282266617, 0.9180561304092407, 0.0015400615520775318]"
 69 | artificial intelligence  ai  newsletter by towards ai     via  towardsai    mw    machinelearning  ml  artifciialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment  ainews  ainewsletter,1,1,"[0.004402304533869028, 0.9955834746360779, 1.4152177755022421e-05]"
 70 | all you needt o know to start with deep learning via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment,0,0,"[0.9997861385345459, 8.93956603249535e-05, 0.0001245760649908334]"
 71 | minikf is the fastest and easiest way to get  kubeflow up and running on   or your laptop  got questions  we have a new technical minikf faq that just went live     machineleanring  mlops  datascience,0,0,"[0.999112606048584, 0.0002522862341720611, 0.0006350554758682847]"
 72 |  launches zero emission ai cloud with integratedm lops technology stack optimized for nvidia   ein news ,0,0,"[0.8072778582572937, 0.010195355862379074, 0.18252688646316528]"
 73 | streaming live at   p edt is matt cowell from  with our lunchtime keynote   can humans learn like machines  the case for human machine learning    join his session  free       machinelearning  executive  augmentedmachinelearnnig  mlops ,0,0,"[0.9853835105895996, 0.004197043366730213, 0.010419302619993687]"
 74 | the imitation game  can you tell the difference between people and  ai    deeplearning  ml  lmops  aiops  datascience,1,1,"[0.025944195687770844, 0.9737597703933716, 0.0002961347345262766]"
 75 | mlops with kubernetes  rabbitmq and fastapi   wewantdata  data  inisghts  bigdata  web  database  tech  marketing ,0,0,"[0.9994960427284241, 0.0002712365530896932, 0.00023269359371624887]"
 76 | the role of mlops on effective ai by carl w ahndlin wallace ,2,2,"[0.0011887723812833428, 1.114126575885166e-06, 0.9988101124763489]"
 77 | mlops iwth kubernetes  rabbitmq and fastapi  ,0,0,"[0.99948650598526, 0.000281448126770556, 0.0002319987106602639]"
 78 | rt mlops with kubernetes  rabbitmq and fastapi   mlops  imcroservices  machinelearning  python ,0,0,"[0.9997118711471558, 0.00015001899737399071, 0.00013815666898153722]"
 79 | big thanks  for the super      mlopsforgood swag   was super fun working on this project together  looking forwar dto the next one       opensource  mlops  aiforgood ,0,0,"[0.9649217128753662, 0.00535299489274621, 0.029725266620516777]"
 80 | datatro nintroduces new features to mlops and ai governance solution ,0,0,"[0.9995730519294739, 0.00018315730267204344, 0.0002437642397126183]"
 81 | neu ro launches zero emission ai cloud with integrated mlops technology stacko ptimized for nvidia architectures ,0,0,"[0.9998247623443604, 7.013216963969171e-05, 0.0001052175066433847]"
 82 | join  today masterclass prat    we examine the final leg of the journey to move the  ai model into business    modelops  mlops  aiethics  aigovernance  enterpriseai ,0,0,"[0.9918893575668335, 0.006094373296946287, 0.002016287064179778]"
 83 | hot off the press  we ve released new research about the current state of machine learning in the enterprise  download the erport to discover the   latest industry trends you need to know   mltrends  enterpriseml  mlops  machinelearning,0,0,"[0.9937769174575806, 0.0016631459584459662, 0.004559958819299936]"
 84 |   october heartbeat is out  all the news from our growing community       mlops workflows    lots of ways to learn    meetup and conference videos    docs udpates    info on our growing team  and more       ,2,2,"[0.010104007087647915, 4.623148197424598e-05, 0.9898495674133301]"
 85 | found the ultimate project list for  ml  ai  python  nlp  computervision     deeplearning  neuralnetworks  machinelearning  datascience  datascinetist  datamining  mlops,0,0,"[0.9996525049209595, 0.00018718511273618788, 0.00016031661652959883]"
 86 | from sci fi films to reality   artificiallintelligence has become one of the hottest fields in modern technology  ho wexactly does ai benefit us and improve  quality of  life   read more       datascience  machinelearning  mlops  nocode ,2,0,"[0.9724183082580566, 0.02142958901822567, 0.0061520473100245]"
 87 | anindya has a great talk linked up fo r datascientists   dataengineers  and  mlops folks  tune in tomorrow and be sure to let me know what you think  ,2,2,"[0.00047982463729567826, 2.1706216557504376e-07, 0.9995198845863342]"
 88 | thinking darwinian via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopemnt,0,0,"[0.9813258647918701, 0.005426954943686724, 0.013247109018266201]"
 89 |      mlops and automl are two of the most popular applications of machine learning today  giving teams the ability to automate tasks and bring devops principles to mcahine learning use cases ,2,2,"[0.000763630261644721, 5.13763836806902e-07, 0.9992358684539795]"
 90 |      mlops and devops  why data makes it different   o reilly radar ,0,0,"[0.9995023608207703, 0.00019818305736407638, 0.00029939220985397696]"
 91 | seldon s fsi leda richard jarvis explores why bank omnichannel success needs mlops to truly scale in our latest blog post    ,2,2,"[0.0052197836339473724, 1.6140877050929703e-05, 0.9947640299797058]"
 92 | data changes over time resulting in predictive performance degradation  in your models      how can you address this issue   often the ersult of concept drift    see how to use these statistical methods to detect  conceptdrift in your models       mlops ,0,0,"[0.9944462776184082, 0.004890242125838995, 0.0006635418976657093]"
 93 |  same i m also trying to do  amp  after learning programming  mlops  devops  cloud  full stack  mobile app dev  web dev etc  now i feel the difference     ,0,0,"[0.9915984272956848, 0.004044204950332642, 0.004357412923127413]"
 94 | we re hosting our first virtual tech ethics meetup  next friday   nd october  if you re interested in delving deeper into practical  ai ethics from an mlops perspective  join us  find out moer details and sign up here      ,0,0,"[0.9989414811134338, 0.00032748529338277876, 0.0007310412474907935]"
 95 | mlops and devops w hy data makes it different ,0,0,"[0.9996846914291382, 0.00012176520249340683, 0.00019348404021002352]"
 96 |  on demand webinar  watch fern halper from   ankita gupta from   sanjithraj rao from   and lti s shivanand pawar discuss optimizing mlpos journey  amp  best practices for success in the recently concluded webinar    letssolve ,2,2,"[0.04078484699130058, 0.00033405638532713056, 0.9588810205459595]"
 97 | a fudnamental principle of neuroscience that is inspiring optimizations in neural networks via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelop ,2,0,"[0.9982366561889648, 0.0007150783785618842, 0.0010482212528586388]"
 98 | how to generate th erequirements of your python project based on your imports via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment via  ,0,0,"[0.999488353729248, 0.0002877443330362439, 0.0002239350724266842]"
 99 | rela time stock news sentiment analyzer via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment via   towardai,0,0,"[0.9993545413017273, 0.0003615759778767824, 0.0002838729997165501]"
100 | what does your dat ascience workflow look like   at askanna we talk with data scientists every week  based on what we learned  we created this  datascience  workflow  what do you recognize  what did we miss    ml  machinelearning  ai  mlops  continuousdevelopment ,0,0,"[0.9995591044425964, 0.00016162208339665085, 0.00027925631729885936]"
101 | streamline your computer vision stack with an end to end mlops platform   via   read more     mlops  machinelearning  ml  artificialintelligence  ai  deeplearning  innovation ,2,2,"[0.0015776593936607242, 1.9448652892606333e-06, 0.9984203577041626]"
102 | 


--------------------------------------------------------------------------------
/test_results/Invariance_production_test_results.csv:
--------------------------------------------------------------------------------
  1 | cleaned_tweets,labels,predicted_labels,predicted_probabilities
  2 | in new  market guide on  ai trust  risk and security management   modelops is one of the key pillars in ai trust  amp  risk management    is offering a complimentaryc opy of the guide         enterpriseai  modelopco  ml  aistrategy  mlops ,0,0,"[0.9964763522148132, 0.003300165757536888, 0.00022335691028274596]"
  3 |  mlops is the process of operationalizing your  mahcinelearning models  know more about managing  machinelearningoperationalization   amp  its  impact on business      mlmanagement  artificialintelligence  softwaredevelopment  impressico  digitaltransformation ,2,0,"[0.6456086039543152, 0.027243392542004585, 0.32714787125587463]"
  4 | it s friday so it s time to share some awesomeness  here s   k d running  triton in a end to en d training   serving  scenario  benchmarking included     datascience  mlops  machinelearning,0,2,"[0.08876750618219376, 0.001242325291968882, 0.909990131855011]"
  5 |  infographic  types of  machinelearning  via    artificialintelligence  deeplearning  ai  digitaltransformation  bigdata  analytics  datascience  pytocrh  python  tensorflow  reactjs  cloudcomputing  datascientist  linux     daysofcode  mlops  modelops  deeplearning ,0,0,"[0.9965153336524963, 0.0032605219166725874, 0.0002240451576653868]"
  6 | understanding the  azrue mlops framework  ,0,0,"[0.9958954453468323, 0.003841615514829755, 0.00026288192020729184]"
  7 | mlop swith kubernetes  rabbitmq and fastapi ,0,0,"[0.7078998684883118, 0.23131082952022552, 0.06078921630978584]"
  8 | mlops with kubernetes  rabbitmq and fastapi   analytcis  datascience   bigdata   datascience   datascience  ds   machinelearning,0,0,"[0.6717026233673096, 0.26232820749282837, 0.06596920639276505]"
  9 |   days to go  join us at   explainable ai  xai  summit      as we move one step closert o derisking  ai in enterprises  register now    appliedai  xai  mlops ,0,0,"[0.6786503195762634, 0.2073763906955719, 0.11397319287061691]"
 10 | watch this    minute video lead by our ct o pablo tapia  for an introduction to tuplos  the ml ops platform from    digital  data  automation  ml  development  database  mlops  aiops  bigdata  zerotouch  aiforbusiness  lowcodeplatform   ,0,0,"[0.9968128800392151, 0.0029878742061555386, 0.00019930866255890578]"
 11 |  rt   for  ai to make a sizable contribution to a company s bottom line  organizations must scale the technology acrosst he organization   mlops can help  but the  ceo must facilitate it   ,0,0,"[0.7677634954452515, 0.10348440706729889, 0.1287519931793213]"
 12 | big data analytics  a viable solution to all healthcare problems via  towardsai     macihnelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment via   towardai,0,0,"[0.9968908429145813, 0.002915390068665147, 0.0001938095228979364]"
 13 | tdatascience  rt   mlops with kubernetes  rabbitmq and fastapi  my new article was published on   thanks    skipper is a simple and flexible open source ml workflow engine  it helps tor un and scale ml services in production     ,0,0,"[0.7782416343688965, 0.17550846934318542, 0.046249911189079285]"
 14 |   different approaches for train test splitting of a pandas dataframe via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  progarmming  news  research  coding  aidevelopment via   ,0,0,"[0.9966549277305603, 0.003133349819108844, 0.00021156204456929117]"
 15 | artificial intelligence  ai  newsletter by towards ai     via  towardsai    mw    machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programmign  news  research  coding  aidevelopment  ainews  ainewsletter v ,1,1,"[0.24477313458919525, 0.6673772931098938, 0.08784963935613632]"
 16 | the role of m lops on effective  ai  ,2,2,"[0.23168961703777313, 0.008777985349297523, 0.7595323324203491]"
 17 | responsible ai widgets provides a collection of model and data exploration and asssesment user interfaces that enable a better understanding of ai systems    mlops,2,0,"[0.6743612885475159, 0.031493671238422394, 0.29414504766464233]"
 18 | mlops with kubernetes  rabbtimq and fastapi   ml  machinelearning  ai  artificialintelligence,0,0,"[0.9960924983024597, 0.0036561377346515656, 0.00025132461450994015]"
 19 | for  ai to make a sizbale contribution to a company s bottom line  organizations must scale the technology across the organization   mlops can help  but the  ceo must facilitate it   ,0,0,"[0.7841159105300903, 0.11956708133220673, 0.09631700813770294]"
 20 | th estate of ai in       machine learning in production  mlops and data centric ai   artificiallintelligence  machinelearning  data  qacycle ,0,0,"[0.7806621789932251, 0.18903343379497528, 0.030304528772830963]"
 21 | the state of ai in       amchine learning in production  mlops and data centric ai   artificiallintelligence  machinelearning  data  appsunify ,0,0,"[0.9957331418991089, 0.003992869984358549, 0.0002739278133958578]"
 22 |    communityday track about kubernetes deconstructed   aws edition  speaker  ninad pundalik     if you are mlops  amp  devops enthsuiasts  do join the same     acd      awsusergroups  awscommunityday  ,0,0,"[0.7732478380203247, 0.17958824336528778, 0.04716384410858154]"
 23 | next up on the judging panel for the  mozdyaihackathon is angel rivera  senior developer advocate at    angel is an experienced  hackathon mentor and judge  and we re so excited to have him on our panel       ai  devlife  coding  devops  mlops ,2,2,"[0.40167421102523804, 0.02482571266591549, 0.5734999775886536]"
 24 | businesses in apac that invest in  customerexperience are becoming pandemic proof   covidburnout  cx  custexp  custserv  infinitejourneys  rox  retrust  ex  hcd  hcxd  designthinking  servdes  ai  lifejourneys  momentsoflife  momentsthatmatter  mlops,2,0,"[0.9945065975189209, 0.005119737703353167, 0.0003735064237844199]"
 25 |   communityday track about explainable ai with amazon sagemaker clarify by    sarbani maiti     if you are mlops enthusiasts  do join the same     acd      awsusergroups  aswcommunityday ,0,0,"[0.9942811727523804, 0.005316091235727072, 0.00040273607010021806]"
 26 | what makes an optimal  customerexperience in          cx  custserv  custexp  designthinking  jtbd  innovation  ai  mlop s devops  purpose  retrust  infinitejourneys  rox  experienceequity,0,0,"[0.9874982833862305, 0.011554501950740814, 0.0009471528464928269]"
 27 |  like mlops  kdiops takes a village,0,0,"[0.9969377517700195, 0.0028716595843434334, 0.00019061024067923427]"
 28 | agile  mindset needed in technology and business   innovation  strategy   machinelearning  datascience  pytohn  ai     daysofcode   iot  flutter  javascript  serverless  womenintech  cybersecurity  technology   womenwhocode  bigdata  deeplearning  data  mlops  rstats ,2,2,"[0.1650698184967041, 0.00620446540415287, 0.8287256956100464]"
 29 |  the rol eof mlops on effective ai   ,2,2,"[0.15323346853256226, 0.004205780569463968, 0.8425607681274414]"
 30 | for  ai to make a sizable contribution to a company s bottom line  organizations must scale the technology across the organization  m lops can help  but the  ceo must facilitate it   ,0,0,"[0.7621762752532959, 0.10468554496765137, 0.13313817977905273]"
 31 |        is there a way to compare these wit he g  git    dvc branches   mlops  modelops ,0,1,"[0.3838743567466736, 0.5366832613945007, 0.07944231480360031]"
 32 | a copmlete mlops toolbox by martin carmona ,0,0,"[0.9958305954933167, 0.003902552416548133, 0.0002668892266228795]"
 33 | datatron introduces new features to mlops and ai governance solution   prnewswire ,0,0,"[0.945685863494873, 0.0491819903254509, 0.005131965968757868]"
 34 | enusre machine learning success through mlops  ,2,0,"[0.4987344443798065, 0.06378398090600967, 0.4374815821647644]"
 35 | datatron introduces new features to mlops and ai governance solution   prnewswrie ,0,0,"[0.9454514384269714, 0.04938902333378792, 0.005159459542483091]"
 36 | i m be giving a talk at the  conference  only onew eek away   get your tickets now   towards cloud native distributed machine learning pipelines at scale     machinelearning  python  datascience  mlops  devops  cloudnative  kubernetes,0,0,"[0.9922609329223633, 0.007194820325821638, 0.0005442930269055068]"
 37 | prepare yourself for success with a strong foundation in machine learning essentials  including  mlops  securing  lm environments  and training ml models at scale  sign up for free today   ,2,2,"[0.07517533004283905, 0.0008303517824970186, 0.923994243144989]"
 38 | the latest update for  algorithmia includes  struggling with  machinelearning  you re not alone  and   report  a comprehensive guide for machine learning governance in th eenterprise      mlops  ai  analytics ,2,2,"[0.11064320057630539, 0.0021819151006639004, 0.8871749043464661]"
 39 | big data analytics  a viable solution to all healthcare problems via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  reesarch  coding  aidevelopment,0,0,"[0.9968715906143188, 0.002933291019871831, 0.00019528463599272072]"
 40 |  paper recommendation  this paper explains why deep learning models such as bert  clip  gpt and dall  earen t just new machine learning models  but what they are now calling  foundation models   mlops  datascientist  machinelearning  foundationmodels  standfordai  jai ,0,0,"[0.5046657919883728, 0.3528582751750946, 0.14247587323188782]"
 41 | the rloe of mlops on effective ai by carl w handlin wallace ,2,2,"[0.3064640760421753, 0.013173254206776619, 0.6803627014160156]"
 42 | build new skills in  ml   nlp  mlops  and much more whatever your skill level with     hands on training sessions and expert led workshops at  odscwest this november  register now     off ends soon    datascience  ai  machinelearning  nlp ,2,2,"[0.1051965057849884, 0.0018729001749306917, 0.8929306268692017]"
 43 |  odscwest      will host some of the best and brightest minds  in  ml  dl   mlops  and more  don t miss this chance to learn from the leading experts in your field  register now     of fends soon   ,2,2,"[0.07636086642742157, 0.0008551652426831424, 0.9227839708328247]"
 44 | our upcoming training session on practical mlops will address some of the challenges and questoins that you might face while building out your organization s mlops   datascoemce  mlops ,0,0,"[0.9658889174461365, 0.031115038320422173, 0.0029959676321595907]"
 45 | thrilled to kick off our product blog series highlighting the new features and enhancements in the verta platform    first up  the all important capability of managing access across different types of users and teams  and supporting collaborative ai ml development   mlop s verta ,2,2,"[0.16167517006397247, 0.005087140016257763, 0.8332377076148987]"
 46 |       key finding      operations  organizations that document and neforce mlops processes are twice as likely to achieve their goals to a high degree  they are also nearly  x more likely to be highly prepared for ai related risks,0,0,"[0.6923893094062805, 0.025036849081516266, 0.2825738787651062]"
 47 |      we re looking forward to participating in the fireside chat at  xaisummit   next week  wednesday  co founder  will have a hcat with fellow panelists from    and  on mlops ecosystems     free registration  ,0,2,"[0.3861650824546814, 0.026446418836712837, 0.5873884558677673]"
 48 | all this talk about mlops but what i really struggle wtih is kidops   ,0,0,"[0.5316697359085083, 0.13710354268550873, 0.331226646900177]"
 49 | reasons organizations must invest in data enginereing and mlops talents   pcquest ,0,0,"[0.9966242909431458, 0.0031645207200199366, 0.00021117455617059022]"
 50 | from  insights  gt  gt  see how  is a key  modelops vendor    see why         machinelearning  bigdata  ai  enterpriseai  datascience  mlops  modelopco  modelgovernance  modelriskmanagement  datascientists  aistrategy ,0,0,"[0.9948321580886841, 0.004829770885407925, 0.0003379612462595105]"
 51 |   different approachse for train test splitting of a pandas dataframe via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment,0,0,"[0.9968273639678955, 0.0029735269490629435, 0.00019914739823434502]"
 52 | why do you need a feature store for machine learning   learn this and more on our webcast on kubeflow feast  watch to learn more        mlops  kubeflow  featureengineering  kbueflowfeast ,2,2,"[0.09164253622293472, 0.0013369751395657659, 0.9070204496383667]"
 53 | a gentle introudction to  mlops by yashaswi nayak in  ,0,0,"[0.996032178401947, 0.0037136124446988106, 0.000254080950981006]"
 54 | inusrance agents have to be very good at decision making in the insurance industry  with the help of ai  they can make the best decisions and provide enhanced customer service  read this article to know more about it      xpressoai  datascientists  mlops ,2,2,"[0.07920340448617935, 0.000931842252612114, 0.9198647141456604]"
 55 | mlops with kubernetes  rabbitmq and fastapi  my new article was published on   thanks    skipper is a simple and flexible open source ml workflow engine  it helps to run and scale ml services in production    python  kubernetes  read   ,0,0,"[0.6621905565261841, 0.2723737061023712, 0.06543572247028351]"
 56 |  datascientists and data engineers play a hgue role in  mlops and  devops  with the right  data  both teams work closely to generate the best  application performance  head to the blog now to learn more   via    devops  cloud  programming  aws ,2,2,"[0.09338055551052094, 0.001373759936541319, 0.9052456617355347]"
 57 | read our full benchmark comparing mlops enterprise readiness soluitons in the cloud from analysts  and jake dolezal       machinelearning  artificialintelligence  deeplearning  ai  bigdata  analytics  datascience  cloudcomputing  mlops ,2,2,"[0.10897282510995865, 0.002131231129169464, 0.8888959884643555]"
 58 | good overview and introduction to  mlops for  datascience by      analytics  iianalytics  tech  technology  artificialintelligence  machinelearning  ml  ai  data  dataanalytics d ataandanalytics,2,2,"[0.09891516715288162, 0.0016732927178964019, 0.8994114995002747]"
 59 | check this summary of what s new in  kubeflow      plus a breakdown of contributor and chnage stats for each component      machinelearning  datascience  mlops,0,0,"[0.9952695965766907, 0.004413694608956575, 0.0003167215909343213]"
 60 | iguazio mlops platform now supports amazon fsx for nteapp ontap ,0,0,"[0.6057823300361633, 0.023763388395309448, 0.3704543113708496]"
 61 | iguazio mlops platform nwo supports amazon fsx for netapp ontap ,0,0,"[0.9963659048080444, 0.003403782146051526, 0.0002302663924638182]"
 62 |   tools for machine learning serving in mlops    tensorflow serving   torch serve   bentoml   sagemaker   cortex labs   ployagon   aible   seldon   lagorithmia,0,0,"[0.9960690140724182, 0.0036697378382086754, 0.0002611815871205181]"
 63 |  mlops is hot      lots of interesting work happening in the startup ecosystem to help enterprises operationalize ml   join us at  xaisummit   to listen to these amazing speakers from         register today   ,2,2,"[0.08134283870458603, 0.001001441851258278, 0.9176558256149292]"
 64 |  infographic  types of  machinelearning  artificialintelligence  deeplearning  ai  digitaltransformation  bigdata  analytisc  datascience  pytorch  python  tensorflow  reactjs  cloudcomputing  datascientist  linux     daysofcode  mlops  modelops  deeplearning ,0,0,"[0.9964445233345032, 0.003326390404254198, 0.00022906716912984848]"
 65 |   only   weeks away from our  mlopssalon   we ll be bringing together expertsf rom industry as well as research  and showcase best practices  real world case studies  and a wonderful panel discussion  join us and register here      mlops  machinelearning,2,2,"[0.08088953793048859, 0.000979499309323728, 0.9181309342384338]"
 66 | join this upcoming event to learn more about reproducibility   mlops  memoizatoin  static checking and more  register now    odsc  datascience  ai ,2,2,"[0.07691574096679688, 0.0008849663427099586, 0.9221992492675781]"
 67 | from faster model deployment and anomaly detection to adoption of real time data  read how businesse suse  mlops to improve management ,0,0,"[0.9930604100227356, 0.006471828557550907, 0.00046773048234172165]"
 68 | are we heading towards a new wave of mlops tool evoultion  i think so  here is a small write up on our thought process    mlops  netbook  mlinfraops  datascience  ,1,1,"[0.2561167776584625, 0.6674502491950989, 0.0764329805970192]"
 69 | artificial intelligence  ai  newsletter by towards ai     via  towardsai    mw    machinelearning  ml  artifciialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment  ainews  ainewsletter,1,1,"[0.24498817324638367, 0.6660119891166687, 0.08899985998868942]"
 70 | all you needt o know to start with deep learning via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment,0,0,"[0.9952159523963928, 0.004472442902624607, 0.00031160275102593005]"
 71 | minikf is the fastest and easiest way to get  kubeflow up and running on   or your laptop  got questions  we have a new technical minikf faq that just went live     machineleanring  mlops  datascience,0,0,"[0.993370771408081, 0.0061850701458752155, 0.0004441736964508891]"
 72 |  launches zero emission ai cloud with integratedm lops technology stack optimized for nvidia   ein news ,0,0,"[0.5411422848701477, 0.3883601129055023, 0.07049757987260818]"
 73 | streaming live at   p edt is matt cowell from  with our lunchtime keynote   can humans learn like machines  the case for human machine learning    join his session  free       machinelearning  executive  augmentedmachinelearnnig  mlops ,0,2,"[0.3963286578655243, 0.017366835847496986, 0.5863044857978821]"
 74 | the imitation game  can you tell the difference between people and  ai    deeplearning  ml  lmops  aiops  datascience,1,1,"[0.2585621774196625, 0.6317030787467957, 0.1097346767783165]"
 75 | mlops with kubernetes  rabbitmq and fastapi   wewantdata  data  inisghts  bigdata  web  database  tech  marketing ,0,0,"[0.6223801374435425, 0.30713409185409546, 0.07048574090003967]"
 76 | the role of mlops on effective ai by carl w ahndlin wallace ,2,2,"[0.2497011423110962, 0.009928539395332336, 0.7403702735900879]"
 77 | mlops iwth kubernetes  rabbitmq and fastapi  ,0,0,"[0.6326648592948914, 0.2954410910606384, 0.07189397513866425]"
 78 | rt mlops with kubernetes  rabbitmq and fastapi   mlops  imcroservices  machinelearning  python ,0,0,"[0.6604217886924744, 0.274705708026886, 0.06487248837947845]"
 79 | big thanks  for the super      mlopsforgood swag   was super fun working on this project together  looking forwar dto the next one       opensource  mlops  aiforgood ,0,2,"[0.26532408595085144, 0.012645184993743896, 0.722030758857727]"
 80 | datatro nintroduces new features to mlops and ai governance solution ,0,0,"[0.995606005191803, 0.004109969828277826, 0.0002839597873389721]"
 81 | neu ro launches zero emission ai cloud with integrated mlops technology stacko ptimized for nvidia architectures ,0,0,"[0.9966084957122803, 0.0031772786751389503, 0.0002141773875337094]"
 82 | join  today masterclass prat    we examine the final leg of the journey to move the  ai model into business    modelops  mlops  aiethics  aigovernance  enterpriseai ,0,0,"[0.9854965209960938, 0.013322942890226841, 0.0011805054964497685]"
 83 | hot off the press  we ve released new research about the current state of machine learning in the enterprise  download the erport to discover the   latest industry trends you need to know   mltrends  enterpriseml  mlops  machinelearning,0,0,"[0.6281914114952087, 0.026905635371804237, 0.344902902841568]"
 84 |   october heartbeat is out  all the news from our growing community       mlops workflows    lots of ways to learn    meetup and conference videos    docs udpates    info on our growing team  and more       ,2,2,"[0.4681890904903412, 0.016620755195617676, 0.515190064907074]"
 85 | found the ultimate project list for  ml  ai  python  nlp  computervision     deeplearning  neuralnetworks  machinelearning  datascience  datascinetist  datamining  mlops,0,0,"[0.995892345905304, 0.0038427524268627167, 0.0002650012611411512]"
 86 | from sci fi films to reality   artificiallintelligence has become one of the hottest fields in modern technology  ho wexactly does ai benefit us and improve  quality of  life   read more       datascience  machinelearning  mlops  nocode ,2,0,"[0.5610067248344421, 0.030900394544005394, 0.4080928564071655]"
 87 | anindya has a great talk linked up fo r datascientists   dataengineers  and  mlops folks  tune in tomorrow and be sure to let me know what you think  ,2,2,"[0.09707242250442505, 0.001532541704364121, 0.9013950824737549]"
 88 | thinking darwinian via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopemnt,0,0,"[0.9967163801193237, 0.0030752187594771385, 0.00020837220654357225]"
 89 |      mlops and automl are two of the most popular applications of machine learning today  giving teams the ability to automate tasks and bring devops principles to mcahine learning use cases ,2,2,"[0.09231641888618469, 0.0013705312740057707, 0.9063130617141724]"
 90 |      mlops and devops  why data makes it different   o reilly radar ,0,0,"[0.9952362775802612, 0.00445513566955924, 0.0003086017386522144]"
 91 | seldon s fsi leda richard jarvis explores why bank omnichannel success needs mlops to truly scale in our latest blog post    ,2,2,"[0.45805591344833374, 0.01759915053844452, 0.5243449211120605]"
 92 | data changes over time resulting in predictive performance degradation  in your models      how can you address this issue   often the ersult of concept drift    see how to use these statistical methods to detect  conceptdrift in your models       mlops ,0,2,"[0.4532445967197418, 0.07632842659950256, 0.4704269468784332]"
 93 |  same i m also trying to do  amp  after learning programming  mlops  devops  cloud  full stack  mobile app dev  web dev etc  now i feel the difference     ,0,0,"[0.5526949167251587, 0.024858929216861725, 0.42244619131088257]"
 94 | we re hosting our first virtual tech ethics meetup  next friday   nd october  if you re interested in delving deeper into practical  ai ethics from an mlops perspective  join us  find out moer details and sign up here      ,0,0,"[0.628234326839447, 0.2226191908121109, 0.14914649724960327]"
 95 | mlops and devops w hy data makes it different ,0,0,"[0.9954752326011658, 0.004232785198837519, 0.00029194532544352114]"
 96 |  on demand webinar  watch fern halper from   ankita gupta from   sanjithraj rao from   and lti s shivanand pawar discuss optimizing mlpos journey  amp  best practices for success in the recently concluded webinar    letssolve ,2,2,"[0.23922623693943024, 0.007067013997584581, 0.753706693649292]"
 97 | a fudnamental principle of neuroscience that is inspiring optimizations in neural networks via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelop ,2,0,"[0.4875865876674652, 0.040823642164468765, 0.47158968448638916]"
 98 | how to generate th erequirements of your python project based on your imports via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment via  ,0,0,"[0.9957854151725769, 0.00394292501732707, 0.0002716170565690845]"
 99 | rela time stock news sentiment analyzer via  towardsai     machinelearning  ml  artificialintelligence  mlops  ai  datascience  deeplearning  technology  programming  news  research  coding  aidevelopment via   towardai,0,0,"[0.9955062866210938, 0.004199127200990915, 0.00029448879649862647]"
100 | what does your dat ascience workflow look like   at askanna we talk with data scientists every week  based on what we learned  we created this  datascience  workflow  what do you recognize  what did we miss    ml  machinelearning  ai  mlops  continuousdevelopment ,0,0,"[0.9949839115142822, 0.004690011031925678, 0.0003260923840571195]"
101 | streamline your computer vision stack with an end to end mlops platform   via   read more     mlops  machinelearning  ml  artificialintelligence  ai  deeplearning  innovation ,2,2,"[0.10797715932130814, 0.001998339779675007, 0.8900244832038879]"
102 | 


--------------------------------------------------------------------------------
/test_results/MFT_latest_test_results.csv:
--------------------------------------------------------------------------------
1 | negated_text,labels,predicted_labels,predicted_probabilities
2 | it is not sunny,0,0,"[0.9976467490196228, 0.0017173351952806115, 0.0006358569371514022]"
3 | pasta is not very delicious,1,2,"[0.3838845193386078, 0.02801734395325184, 0.5880982279777527]"
4 | the product is not worse,2,1,"[0.01968579739332199, 0.9801393747329712, 0.00017482005932834]"
5 | mlops is not inspired from devops,0,0,"[0.9993830919265747, 0.00034421923919580877, 0.0002726506209000945]"
6 | John is not a morning person,0,0,"[0.9989610910415649, 0.0006495718262158334, 0.0003893142275046557]"
7 | 


--------------------------------------------------------------------------------
/test_results/MFT_production_test_results.csv:
--------------------------------------------------------------------------------
1 | negated_text,labels,predicted_labels,predicted_probabilities
2 | it is not sunny,0,0,"[0.9916446805000305, 0.007791681680828333, 0.0005636655259877443]"
3 | pasta is not very delicious,1,1,"[0.242270827293396, 0.6703853011131287, 0.08734394609928131]"
4 | the product is not worse,2,0,"[0.43473562598228455, 0.24204568564891815, 0.3232187330722809]"
5 | mlops is not inspired from devops,0,0,"[0.9960785508155823, 0.003672090359032154, 0.00024939357535913587]"
6 | John is not a morning person,0,0,"[0.9937883019447327, 0.005802359897643328, 0.00040929310489445925]"
7 | 


--------------------------------------------------------------------------------
/utils/experiment_tracking.py:
--------------------------------------------------------------------------------
 1 | """
 2 | @author: Jithin Sasikumar
 3 | 
 4 | Module to track model training and log the model artifacts, resulting metrics
 5 | and parameters. For that purpose, `MLFlow` is used. This module has the flexibility
 6 | to extend its functionality to support other tracking mechanism like tensorboard etc.
 7 | It is facilitated via `ExperimentTracker protocol` which is similar to interface.
 8 | """
 9 | 
10 | import mlflow
11 | from typing import Protocol
12 | from dataclasses import dataclass
13 | 
14 | class ExperimentTracker(Protocol):
15 |     """
16 |     Interface to track experiments by inherting from Protocol class.
17 |     """
18 |     def __start__(self):
19 |         ...
20 | 
21 |     def log(self):
22 |         ...
23 | 
24 |     def end(self):
25 |         ...
26 | 
27 | @dataclass
28 | class MLFlowTracker:
29 |     """
30 |     Dataclass to track experiment via MLFlow.
31 | 
32 |     Instance variables
33 |     ------------------
34 |     experiment_name: str
35 |         Name of the experiment to be activated or created.
36 |     tracking_uri: str
37 |         URI of EC2 instance where MLflow server is hosted.
38 |     run_name: str
39 |         Name of training run pertaining to an experiment.
40 |     experiment: bool
41 |         Boolean to create a new experiment, else False.
42 |     """
43 | 
44 |     experiment_name: str
45 |     tracking_uri: str
46 |     run_name: str
47 |     experiment: bool
48 |     
49 |     def __start__(self) -> None:
50 |         """
51 |         Dunder method to start a new mlflow run in MLFlow server and set
52 |         model tracking URI and create experiment.
53 | 
54 |         Parameters
55 |         ----------
56 |             None
57 |         
58 |         Returns
59 |         -------
60 |             None
61 | 
62 |         Raises
63 |         ------
64 |         ConnectionError: Exception
65 |             If mlflow tracking URI doesn't exist or invalid.
66 |         """
67 |         try:
68 |             mlflow.set_tracking_uri(self.tracking_uri)
69 | 
70 |         except ConnectionError:
71 |             print(f"Cannot connect to {self.tracking_uri}. Please check and validate the URI!!")
72 | 
73 |         else:
74 |             if self.experiment:
75 |                 exp_id = mlflow.create_experiment(self.experiment_name)
76 |                 experiment = mlflow.get_experiment(exp_id)
77 | 
78 |             else:
79 |                 experiment = mlflow.set_experiment(self.experiment_name)
80 | 
81 |             mlflow.start_run(run_name = self.run_name,
82 |                             experiment_id = experiment.experiment_id)
83 |     
84 |     def log(self) -> None:
85 |         """
86 |         Initialize auto-logging for tracking. This will log model
87 |         artifacts in S3 bucket, parameters and metrics in the EC2 instance.
88 |         """
89 |         self.__start__()
90 |         mlflow.tensorflow.autolog()
91 | 
92 |     def end(self) -> None:
93 |         """
94 |         End an active MLflow run.
95 |         """
96 |         mlflow.end_run()


--------------------------------------------------------------------------------
/utils/helper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | @author: Jithin Sasikumar
  3 | 
  4 | Module consisting of helper functions which is generic across the project.
  5 | """
  6 | 
  7 | import re
  8 | import os
  9 | import nltk
 10 | import pandas as pd
 11 | from textblob import TextBlob
 12 | from nltk.probability import FreqDist
 13 | import tomli as tomlib
 14 | from typing import Any
 15 | from dataclasses import dataclass
 16 | from airflow import settings
 17 | from airflow.exceptions import AirflowFailException
 18 | from airflow.models.connection import Connection
 19 | 
 20 | class Config:
 21 |     """
 22 |     Loads all configurations from `config.toml` for the project.
 23 |     """
 24 |     def __new__(cls) -> dict[str, Any]:
 25 |         """
 26 |         Dunder method to load config.
 27 | 
 28 |         Parameters
 29 |         ----------
 30 |         cls
 31 |             Class to be instantiated.
 32 | 
 33 |         Returns
 34 |         -------
 35 |         config: dict[str, Any]
 36 |             Loaded configurations as dict.
 37 |         """
 38 | 
 39 |         with open("./config/config.toml", mode="rb") as config_file:
 40 |             config = tomlib.load(config_file)
 41 |         return config
 42 | 
 43 | def load_dataframe(file_path: str) -> pd.DataFrame:
 44 |     """
 45 |     Helper function to load any parquet file as pandas dataframe.
 46 | 
 47 |     Parameters
 48 |     ----------
 49 |     file_path: str
 50 |         Path to input parquet file.
 51 |     
 52 |     Returns
 53 |     -------
 54 |     dataframe: pd.DataFrame
 55 |     """
 56 |     this_dir = os.getcwd()
 57 |     dataframe_path = os.path.join(this_dir, file_path)
 58 |     dataframe = pd.read_parquet(path = dataframe_path, engine = "pyarrow")
 59 |     return dataframe
 60 | 
 61 | @dataclass
 62 | class Connections:
 63 |     """
 64 |     Dataclass to configure and set Airflow connections.
 65 |     """
 66 |     new_connection: Connection
 67 | 
 68 |     def create_connections(self) -> bool:
 69 |         """
 70 |         Method to create a new airflow connection
 71 | 
 72 |         Parameters
 73 |         ----------
 74 |             None
 75 | 
 76 |         Returns
 77 |         -------
 78 |         bool
 79 |             True if connection is created, else False.
 80 | 
 81 |         Raises
 82 |         ------
 83 |         AirflowFailException: Exception
 84 |             If connection cannot be created or invalid.
 85 |         """
 86 |         try:
 87 |             session = settings.Session()
 88 |             connection_name = session.query(Connection).filter(
 89 |                                                         Connection.conn_id == self.new_connection.conn_id
 90 |                                                         ).first()
 91 | 
 92 |             if str(connection_name) != str(self.new_connection.conn_id):
 93 |                 session.add(self.new_connection)
 94 |                 session.commit()
 95 | 
 96 |         except Exception as exc:
 97 |             raise AirflowFailException( f"Error when creating new connection:{exc}") from exc
 98 | 
 99 |         else:
100 |             return True
101 |         
102 |         finally:
103 |             session.close()
104 | 
105 | def remove_noise(text: str) -> str:
106 |     """
107 |     Helper function to remove noise from text as part of text cleaning
108 |     using regular expressions (regex).
109 | 
110 |     Parameters
111 |     ----------
112 |     text: str
113 |         Input text
114 |     
115 |     Returns
116 |     -------
117 |     Cleaned text
118 |     """
119 | 
120 |     text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
121 |             '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', text)
122 |     text = re.sub("(@[A-Za-z0-9_]+)","", text)
123 |     text = re.sub('\n',' ', text)
124 |     text = re.sub('#','', text)
125 |     
126 |     return text
127 | 
128 | def calculate_polarity(text: str) -> float:
129 |     """
130 |     Helper function to calculate text polarity.
131 | 
132 |     Parameters
133 |     ----------
134 |     text: str
135 |         Input text
136 |     
137 |     Returns
138 |     -------
139 |     polarity: float
140 |     """
141 |     return TextBlob(text).sentiment.polarity
142 | 
143 | def remove_stopwords(tokens: list[str],
144 |                     stopwords_: nltk.corpus.stopwords) -> list[str]:
145 |     """
146 |     Helper function to remove stopwords from given input tokens.
147 | 
148 |     Parameters
149 |     ----------
150 |     tokens: list[str]
151 |         List of tokens pertaining to each text.
152 |     stopwords_: nltk.corpus.stopwords
153 |         List of stopwords defined in NLTK.
154 | 
155 |     Returns
156 |     -------
157 |     list[str]
158 |         Resultant list of text with no stopwords.
159 |     """
160 |     return [token for token in tokens if token not in stopwords_]
161 | 
162 | def remove_less_frequent_words(dataframe) -> pd.DataFrame:
163 |     """
164 |     Helper function to remove the words that are less frequent (< 2 times).
165 | 
166 |     Parameters
167 |     ----------
168 |     dataframe: pd.DataFrame
169 |         Input dataframe
170 | 
171 |     Returns
172 |     -------
173 |     Resultant dataframe with less frequent words removed.
174 |     """
175 | 
176 |     dataframe['tokenized_strings'] = dataframe['tokenized_tweets'].apply(
177 |                                                                     lambda tokens: ' '.join(
178 |                                                                                     [token for token in tokens if len(token) > 2]
179 |                                                                                     )
180 |                                                                         )
181 |     tokenized_words = nltk.tokenize.word_tokenize(' '.join(
182 |                                                             [word
183 |                                                             for word in dataframe['tokenized_strings']
184 |                                                             ]
185 |                                                         )
186 |                                                 )
187 |     frequency_distribution = FreqDist(tokenized_words)
188 |     dataframe['tokenized_strings'] = dataframe['tokenized_tweets'].apply(
189 |                                                                         lambda tweets: ' '.join(
190 |                                                                                 [tweet for tweet in tweets
191 |                                                                                 if frequency_distribution[tweet] > 2
192 |                                                                                 ]
193 |                                                                             )
194 |                                                                         )
195 |     return dataframe
196 | 
197 | def assign_sentiment_labels(score: float) -> str:
198 |     """
199 |     Helper function to assign sentiment labels to polarity scores.
200 | 
201 |     Parameters
202 |     ----------
203 |     score: float
204 |         Polarity score of each text.
205 | 
206 |     Returns
207 |     -------
208 |     sentiment_label: str
209 |     """
210 |     if score > 0.25:
211 |         return "positive"
212 |     elif score < 0:
213 |         return "negative"
214 |     else:
215 |         return "neutral"


--------------------------------------------------------------------------------
/utils/model.py:
--------------------------------------------------------------------------------
 1 | """
 2 | @author: Jithin Sasikumar
 3 | 
 4 | Model to define deep neural network for training.
 5 | 
 6 | Bi-directional LSTM (biLSTM) network is used for this project encompassing an
 7 | embedding layer, stack of biLSTM layers followed by fully connected dense layers
 8 | with dropout. This module provides the flexibility to add any other models
 9 | by inheriting Models(ABC).
10 | 
11 | """
12 | 
13 | from abc import ABC, abstractmethod
14 | from dataclasses import dataclass
15 | from keras.models import Sequential
16 | from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
17 | 
18 | class Models(ABC):
19 |     """
20 |     Abstract base class that defines and creates model.
21 |     """
22 |     @abstractmethod
23 |     def define_model(self):
24 |         pass
25 | 
26 |     @abstractmethod
27 |     def create_model(self):
28 |         pass
29 | 
30 | 
31 | @dataclass
32 | class BiLSTM_Model(Models):
33 |     """
34 |     Dataclass to create biLSTM model inheriting Models class.
35 |     """
36 |     vocab_size: int
37 |     num_classes: int
38 |     embedding_dim: int = 64
39 |     input_length: int = 128
40 | 
41 |     def define_model(self) -> Sequential:
42 |         """
43 |         Method to define model that can be used for training and inference.
44 |         The existing model can also be tweaked by changing parameters,
45 |         based on the requirements.
46 | 
47 |         Parameters
48 |         ----------
49 |             None
50 | 
51 |         Returns
52 |         -------
53 |         keras.models.Sequential
54 |         """
55 |         return Sequential(
56 |                     [
57 | 
58 |                     # Embedding layer that expects the following:
59 |                     # Size of vocabulary, Output embedding vectors & Size of each input sequence
60 |                     Embedding(self.vocab_size, self.embedding_dim, input_length = self.input_length),
61 | 
62 |                     #Bidirectional LSTM layers
63 |                     Bidirectional(LSTM(self.embedding_dim, return_sequences=True)),
64 |                     Bidirectional(LSTM(64, return_sequences = True)),
65 |                     Bidirectional(LSTM(32)),
66 |                     
67 |                     #Dense layers
68 |                     Dense(self.embedding_dim, activation = 'relu'),
69 |                     Dense(64, activation = 'relu'),
70 |                     Dropout(0.25),
71 |                     Dense(self.num_classes, activation = 'softmax')
72 |                     ]
73 |                 )
74 |         
75 |     def create_model(self) -> Sequential:
76 |         """
77 |         Method to create the model defined by define_model() method
78 |         and prints the model summary.
79 | 
80 |         Parameters
81 |         ----------
82 |             None
83 | 
84 |         Returns
85 |         -------
86 |         model: keras.models.Sequential
87 |             Created model
88 |         """
89 | 
90 |         model: Sequential = self.define_model()
91 |         model.summary()
92 |         return model


--------------------------------------------------------------------------------
/utils/prepare_data.py:
--------------------------------------------------------------------------------
 1 | """
 2 | @author: Jithin Sasikumar
 3 | 
 4 | Module to transform preprocessed dataframe (parquet or csv) into tf.data.Dataset format
 5 | which creates an efficient input pipeline that in turn be fed into the tensorflow model.
 6 | BERT tokenizer is used instead of normal tokenizer for better embeddings.
 7 | 
 8 | """
 9 | import math
10 | import pandas as pd
11 | import numpy as np
12 | import tensorflow as tf
13 | from dataclasses import dataclass, field
14 | from transformers import BertTokenizer
15 | 
16 | @dataclass
17 | class Dataset:
18 |     """
19 |     Dataclass that encodes and transforms dataframe into tensorflow dataset.
20 |     """
21 |     tokenizer: BertTokenizer
22 |     dataframe: pd.DataFrame = field(default_factory = pd.DataFrame())
23 |     labels: np.ndarray = None
24 |     batch_size: int = 64
25 |     max_length: int = 256
26 |     train: bool = False
27 |     col_name: str = "cleaned_tweets"
28 | 
29 |     @property
30 |     def list_of_texts(self) -> list[str]:
31 |         """
32 |         Class property to convert text column of dataframe to list of strings
33 |         for processing.
34 | 
35 |         Parameters
36 |         ----------
37 |             None
38 |         
39 |         Returns
40 |         -------
41 |         list[str]
42 |             List of texts
43 |         """
44 |         return self.dataframe[self.col_name].tolist()
45 | 
46 |     @property
47 |     def shuffle_size(self) -> int:
48 |         """
49 |         Class property to calculate the shuffle size for dataset.
50 | 
51 |         Parameters
52 |         ----------
53 |             None
54 |         
55 |         Returns
56 |         -------
57 |         shuffle_size: int
58 |         """
59 |         return math.ceil(len(self.list_of_texts) / self.batch_size)
60 | 
61 |     def encode_bert_tokens_to_tf_dataset(self) -> tf.data.Dataset.zip:
62 |         """
63 |         Transform tokens into tensorflow dataset. The dataset is batched and
64 |         shuffled.
65 |         
66 |         BERT tokenizer is used => (i.e.) The texts are tokenized and each token
67 |         is encoded into unique IDs referred as input_ids by means of vocabulary.
68 | 
69 |         Parameters
70 |         ----------
71 |             None
72 |         
73 |         Returns
74 |         -------
75 |         dataset: tf.data.Dataset.zip
76 |             Tensorflow dataset after batching and shuffling.
77 |         """
78 |         tokenized: BertTokenizer = self.tokenizer(
79 |                                                 text = self.list_of_texts,
80 |                                                 add_special_tokens = True,
81 |                                                 max_length = self.max_length,
82 |                                                 padding = "max_length",
83 |                                                 return_tensors = "tf",
84 |                                                 return_attention_mask = False,
85 |                                                 return_token_type_ids = False,
86 |                                                 verbose = True
87 |                                             )
88 | 
89 |         input_ids = tf.data.Dataset.from_tensor_slices(np.array(tokenized['input_ids']))
90 |         labels = tf.data.Dataset.from_tensor_slices(self.labels)
91 |         # Zipping input_ids and labels as a single dataset object
92 |         dataset = tf.data.Dataset.zip((input_ids, labels))
93 | 
94 |         if self.train:
95 |             return dataset.shuffle(self.shuffle_size).batch(self.batch_size)
96 | 
97 |         return dataset.batch(self.batch_size)


--------------------------------------------------------------------------------