├── .github └── workflows │ ├── benchmark_and_test_model.yml │ ├── deploy.yml │ └── run_dags.yml ├── .gitignore ├── Makefile ├── README.md ├── config └── config.toml ├── dags ├── etl_twitter_dag.py ├── model_training_dag.py └── task_definitions │ ├── etl_task_definitions.py │ └── model_training.py ├── dependencies ├── Dockerfile └── requirements.txt ├── docker-compose.yaml ├── images ├── Sagemaker_endpoint.jpg ├── architecture_diagram.jpeg ├── ecr_image.PNG ├── etl_dag.PNG ├── mlflow_exps.PNG ├── model_dag.PNG ├── model_plot.png ├── model_registry_latest1.PNG ├── model_registry_latest2.PNG └── model_registry_org.PNG ├── scripts ├── behavioral_test.py ├── deploy.py ├── stage_model_to_production.py └── test_data │ ├── sample_test_data_for_mft.parquet │ └── test_data.parquet ├── test_results ├── Invariance_latest_test_results.csv ├── Invariance_production_test_results.csv ├── MFT_latest_test_results.csv └── MFT_production_test_results.csv └── utils ├── experiment_tracking.py ├── helper.py ├── model.py └── prepare_data.py /.github/workflows/benchmark_and_test_model.yml: -------------------------------------------------------------------------------- 1 | # Name of the workflow 2 | name: Test and benchmark models 3 | 4 | on: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | build: 10 | runs-on: self-hosted 11 | steps: 12 | - uses: actions/checkout@v2 13 | 14 | - name: Test and benchmark models 15 | id: test_benchmark 16 | env: 17 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 18 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 19 | REGION: ${{ secrets.REGION }} 20 | 21 | run: | 22 | python3 -m pip install --upgrade pip 23 | pip install -r ./dependencies/requirements.txt 24 | python -m spacy download en_core_web_sm 25 | python ./scripts/stage_model_to_production.py 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | # Name of the workflow 2 | name: Deploy to sagemaker 3 | 4 | on: workflow_dispatch 5 | 6 | jobs: 7 | build: 8 | runs-on: self-hosted 9 | steps: 10 | - uses: actions/checkout@v2 11 | 12 | - name: Deploy production-ready image from AWS ECR to Sagemaker 13 | id: deploy_to_prod 14 | env: 15 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 16 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 17 | REGION: ${{ secrets.REGION }} 18 | IMAGE_URI: ${{ secrets.IMAGE_URI }} 19 | ARN_ROLE: ${{ secrets.ARN_ROLE }} 20 | 21 | run: | 22 | python3 -m pip install --upgrade pip 23 | pip install -r ./dependencies/requirements.txt 24 | python ./scripts/deploy.py -------------------------------------------------------------------------------- /.github/workflows/run_dags.yml: -------------------------------------------------------------------------------- 1 | # Name of the workflow 2 | name: Run Airflow DAG 3 | 4 | on: workflow_dispatch 5 | 6 | jobs: 7 | build: 8 | runs-on: self-hosted 9 | steps: 10 | - uses: actions/checkout@v2 11 | with: 12 | # Loading the secrets 13 | secrets: | 14 | "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" 15 | "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" 16 | "REGION=${{ secrets.REGION }}" 17 | "LOGIN=${{ secrets.LOGIN }}" 18 | "PASSWORD=${{ secrets.PASSWORD }}" 19 | "HOST=${{ secrets.HOST }}" 20 | "ACCOUNT=${{ secrets.ACCOUNT }}" 21 | "WAREHOUSE=${{ secrets.WAREHOUSE }}" 22 | "DATABASE=${{ secrets.DATABASE }}" 23 | "SCHEMA=${{ secrets.SCHEMA }}" 24 | 25 | - name: Run airflow dag 26 | run: make run_dag -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile to run Airflow DAG in docker container with external dependencies 2 | 3 | include .env 4 | 5 | run_dag: 6 | # Build extended airflow docker image with required pip dependencies 7 | docker build . -f ./dependencies/Dockerfile --tag extending_airflow:latest 8 | # Rebuild airflow webserver and scheduler with our newly build image 9 | docker-compose up -d --no-deps --build airflow-webserver airflow-scheduler 10 | 11 | # Start all required containers to run all airflow services 12 | docker-compose -f docker-compose.yaml up -d 13 | docker ps 14 | sleep 15 15 | 16 | # Triggering DAG for the first time by accessing the webserver container 17 | docker exec -it twitter_bot_airflow-webserver_1 bash -c "airflow dags trigger twitter_data_pipeline_dag_etl 18 | 19 | stop_dag: 20 | docker-compose -f docker-compose.yaml down -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentiment analysis from MLOps paradigm 2 | 3 | ![benchmark](https://github.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/workflows/Test%20and%20benchmark%20models/badge.svg) 4 | ![deploy](https://github.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/workflows/Deploy%20to%20sagemaker/badge.svg) 5 | 6 | This project promulgates an **automated end-to-end ML pipeline** that trains a **bi-directional LSTM** network for sentiment analysis task, **tracks** experiments, **pushes** trained models to **model registry**, benchmark them by means of **model testing** and **evaluation**, pushes the best model into production followed by **dockerizing** the production model artifacts into a deployable image and **deploys** the same into cloud instance via **CI/CD**. 7 | 8 | ## Author 9 | 10 | - [@Jithin Sasikumar](https://www.github.com/Jithsaavvy) 11 | 12 | ## Languages and Tools 13 | 14 |
15 | python 16 | tensorflow 17 | docker 18 | airflow 19 | actions 20 | mlflow 21 | s3 22 | ec2 23 | ecr 24 | sagemaker 25 | sagemaker 26 |
27 |
28 | 29 | ## Motivation 30 | 31 | In a machine learning (ML) project, it comprises of a chain of tasks like data collection, pre-processing, transforming datasets, feature extraction, model training, model selection, evaluation, deployment. For a small-scale project, these tasks can be managed manually but, as the scalability and scope of the project increases, manual process is really a pain. The actual problem arises when the model has to be productionalized in order to make value out of it. MLOps defines various disciplines to nullify such problems and work efficiently. Thus, pipelines are crucial in an ML project and automating such end-to-end pipelines are also vital. 32 | 33 | ## Description 34 | 35 | The project is a concoction of research (sentiment analysis, NLP, BERT, biLSTM), development (text normalization, ETL, transformation, deep neural network training, evaluation, model testing) and deployment (building and packaging model artifacts, tracking, docker, workflows, pipelines, cloud) by integrating CI/CD pipelines with automated releases. 36 | 37 | | ![flowchart](./images/architecture_diagram.jpeg) | 38 | |:--:| 39 | | Figure 1: Complete end-to-end project pipeline| 40 | 41 | ## Technical facets 42 | 43 | 1. Setting up `Airflow` in docker for `workflow orchestration`. 44 | 2. Writing a `Dockerfile` that creates a base docker image with all dependencies installed and secrets containing sensitive credentials and access tokens are mounted. 45 | 2. Defining `ETL` and `model training` workflow followed by scheduling them for orchestration. 46 | 3. Executing `Airflow DAGs`: 47 | - **ETL** - Performs Extract, Transform and Load operation on twitter data. As a result, raw tweets scraped from twitter are processed and loaded into `Snowflake data warehouse` as database table. 48 | - **Model_training** - Deep end-to-end `biLSTM` model is trained using `Tensorflow` by fetching processed data from data warehouse. 49 | 4. Tracking the entire model training using `MLflow server` hosted on `AWS EC2 instance` from which trained model artifacts, metrics and parameters are logged. 50 | 5. Using `AWS S3 buckets` to store the model artifacts and data. 51 | 6. Adding the trained model to `MLflow model registry` on `AWS EC2 instance` that facilitates in managing, maintaining, versioning, staging, testing and productionalizing the model collaboratively. 52 | 7. Automating the `pipeline` as follows: 53 | - Initialize `GitHub Actions` workflows. 54 | - `benchmark_and_test_model.yml` => In order to productionalize a model, simply evaluating the model is not sufficient. So, it is very important to test them. Thus, the best model is pushed into **production stage** by means of **benchmarking** (`behavioral testing` + evaluation). 55 | - `deploy.yml` => The production model from model registry in `EC2 instance` is packaged into a docker image with all required dependencies & metadata as a `deployable model artifact` and pushed into `Amazon ECR` **(CI job)**. The deployable image is then deployed into `AWS Sagemaker` instance which creates an **endpoint** that can be used to communicate with the model for inference **(CD job)**. 56 | - `run_dags.yml` - Triggers Airflow DAG run that performs ETL and model training task based on schedule. 57 | - `release.yml` => A new release will be created automatically when tags are pushed to the repository. 58 | 59 | 60 | ## Directory structure 61 | 62 | ``` 63 | ├── .github 64 | │ └── workflows 65 | │ ├── benchmark_and_test_model.yaml 66 | | ├── deploy.yaml 67 | | ├── release.yaml 68 | | └── run_dags.yaml 69 | ├── config 70 | │   └── config.toml 71 | ├── dags # Directory where every Airflow DAG is defined 72 | │   ├── etl_twitter_dag.py 73 | │   ├── model_training_dag.py 74 | │   └── task_definitions 75 | │   ├── etl_task_definitions.py 76 | │   └── model_training.py 77 | ├── dependencies 78 | │   ├── Dockerfile 79 | │   └── requirements.txt 80 | ├── docker-compose.yaml # Airflow and it's components run as docker containers 81 | ├── images 82 | ├── Makefile # Set of docker commands for Airflow run 83 | ├── README.md 84 | ├── scripts # Contains code for model testing, evaluation and deployment to AWS Sagemaker 85 | │   ├── behavioral_test.py 86 | │   ├── deploy.py 87 | │   ├── stage_model_to_production.py 88 | │   └── test_data 89 | │   ├── sample_test_data_for_mft.parquet 90 | │   └── test_data.parquet 91 | ├── test_results 92 | └── utils 93 | ├── experiment_tracking.py 94 | ├── helper.py 95 | ├── model.py 96 | └── prepare_data.py 97 | 98 | ``` 99 | 100 | ## Pipeline 101 | ### Dependencies & Secrets management 102 | 103 | As mentioned above, Airflow is running in a docker container. In order to install dependencies, a docker image is build with all installed dependencies and it will be used as a base image for `docker-compose`. The dependencies are listed in [requirements.txt](./dependencies/requirements.txt). A more better way would be, to use any kind of dependency management tools like **Poetry** for organizational projects but, it is out of scope for this project. 104 | 105 | One important challenge would be, to manage & handle sensitive information such as **credentials, access tokens** etc needed for Airflow to connect with other services like `AWS S3`, `Snowflake`, `EC2`. It is vulnerable to use any such sensitive info during `docker build` as they will be exposed as a result of layer caching during image build. The secure way is to mount them into the image as **docker secrets** and then export it as environment variables, so that they aren't leaked. It can be done as follows: 106 | 107 | - Create secrets using the command 108 | ``` 109 | docker secret create 110 | ``` 111 | 112 | - Mount those secrets into `/run/secrets/` of the container 113 | ``` 114 | RUN --mount=type=secret,id= \ 115 | export =$(cat /run/secrets/) 116 | ``` 117 | 118 | #### ProTip to do the same in production environment 119 | 120 | The aforementioned steps are not well suitable for production. To do so, use `docker stack`. For more info, refer [here](https://docs.docker.com/engine/swarm/stack-deploy/) 121 | 122 | ### Workflow Orchestration - Airflow 123 | 124 | [Apache Airflow](https://airflow.apache.org/) is used to orchestrate workflows in this project. The workflows are represented as **Directed Acyclic Graph** `(DAG)`. 125 | 126 | ### DAGS 127 | 128 | ### ETL 129 | It is a data workflow that performs Extract Transform Load `(ETL)` task defined in [etl_twitter_dag.py](./dags/etl_twitter_dag.py) on scheduled interval. It performs the following tasks: 130 | - The raw tweets are scraped from twitter using [snscrape](https://pypi.org/project/snscrape/) library and loaded to `AWS S3 bucket`. 131 | - They are cleaned using **regular expressions** and labelled by calculating **polarity** and also loaded to the same `S3 bucket`. 132 | - The labelled data is normalized and preprocessed using NLP techniques and loaded as **database table** to `Snowflake data warehouse` which can be used for analysis and model training. 133 | - The data is stored in the `parquet` format for efficient storage and retrieval. 134 | 135 | | ![flowchart](./images/etl_dag.PNG) | 136 | |:--:| 137 | | Figure 2: ETL Data pipeline - Airflow| 138 | 139 | ### Model training 140 | It is a model training workflow that trains deep end-to-end `biLSTM` network with `BERT tokenizer`. Detailed explanation of biLSTM model can be found [here](#bi-directional-lstm-model). The DAG performs the following tasks: 141 | 142 | - Preprocessed data loaded as a result of ETL pipeline is fetched from the database 143 | table of **snowflake data warehouse** as a **dataframe**. 144 | - External (user-build) **docker container** with `tensorflow GPU` and other dependencies installed, is used to train the model. It is facilitated in Airflow by `DockerOperator` as: 145 | ``` 146 | DockerOperator( 147 | task_id = "train_model_task", 148 | image = "model_training_tf:latest", 149 | auto_remove = True, 150 | docker_url = "unix://var/run/docker.sock", 151 | api_version = "auto", 152 | command = "python3 model_training.py" 153 | ) 154 | ``` 155 | - The **GPU accelerated** training for the above task is defined in [model_training.py](./dags/task_definitions/model_training.py). Additionally, `BERT tokenizer` is used instead of normal tokenizer **(i.e.)** The texts are tokenized and each tokens are encoded into unique IDs referred as `input_ids`. Finally, they are transformed as `tensorflow datasets` for efficient input pipeline and fed into the model. All these are defined in [prepare_data.py](./utils/prepare_data.py). 156 | 157 | | ![flowchart](./images/model_dag.PNG) | 158 | |:--:| 159 | | Figure 3: Model training pipeline - Airflow| 160 | 161 | **Note:**
162 | *GPU used for training*: NVIDIA GeForce GTX 980M with `8GB GDDR5` memory 163 | 164 | ### Bi-directional LSTM model 165 | biLSTM network encompassing an 166 | `embedding layer`, stack of `biLSTM layers` followed by `fully connected dense layers` 167 | with `dropout` is used for this project. The **model plot** is depicted in the below image: 168 | 169 |

170 | 171 |

172 | 173 | ### MLflow Server 174 | 175 | All the experiments are tracked and logged by [MLflow](https://mlflow.org/docs/latest/tracking.html). It is not done locally in a **localhost**, instead the `MLflow Server` is installed and hosted in `AWS EC2 instance` as a **remote tracking server** which paves way for centralized access. The **trained model artifacts** are saved in `AWS S3 bucket` which serves as an artifact store and parameters, metrics (per epoch), all other metadata are logged into EC2 instance itself. 176 | 177 | | ![flowchart](./images/mlflow_exps.PNG) | 178 | |:--:| 179 | | Figure 4: All experiment runs on MLflow Server - EC2 Instance| 180 | 181 | ### MLflow Model Registry 182 | 183 | The models to be staged and tested are pushed to the model registry which serves as a **centralized model store**. It facilitates to manage, version, stage, test and productionalize the model and provides functionalities to work on the models collaboratively. 184 | 185 | | ![flowchart](./images/model_registry_org.PNG) | 186 | |:--:| 187 | | Figure 5: Model Registry with already existing production model and staged model - EC2 Instance | 188 | 189 | ### Benchmarking 190 | 191 | The model with latest version and model in production stage are benchmarked by means of behavioral testing and evaluation. This is done to find out whether the latest model outperforms the current production model. If yes, it triggers the `CI/CD` workflow job. 192 | 193 | Model testing differs from model evaluation. For instance, a model with high evaluation metric doesn't always guarantee to be the best performing model because, it might fail in some specific scenarios. To solve and quantify that, **model testing** is an important aspect in production. 194 | 195 | ### Behavioral testing 196 | 197 | It is based on this [paper](https://homes.cs.washington.edu/~marcotcr/acl20_checklist.pdf) to test the behavior of the model in specific conditions. [Checklist](https://github.com/marcotcr/checklist) library is used for performing both the tests. These testing functions are defined in [behavioral_test.py](./scripts/behavioral_test.py). Three different types of tests are proposed in the paper but only two of them are performed in this project namely: 198 | - Minimum Functionality test (MFT) 199 | - Invariance test (INV) 200 | 201 | ### MFT: 202 | MFT is inspired from unit test. A specific behavior (or) capability of the model is tested. 203 | 204 | | 1. | **Model** | Sentiment Analysis | 205 | |----|:-------------------------:|:------------------------------------------------------------------------------------------------------------------:| 206 | | 2. | **Dataset** | Perturbed dataset created from a small subset of test dataset with labels. Original texts are negated as perturbed | 207 | | 3. | **Minimum functionality** | Negations (i.e.) how well the model handles negated inputs | 208 | | 4. | **Example** | *Original text*: This product is very good - **Positive**
*Negated text*: This product is not very good - **Negative** | 209 | | 5. | **Expected behavior** | Model should be generalized to predict correct labels for both original and negated text | 210 | 211 | ### INV 212 | 213 | Label-preserving perturbations are applied to the test data. Despite perturbing the data, the model is expected to give the same prediction. 214 | 215 | | 1. | **Model** | Sentiment Analysis | 216 | |:---:|:-------------------------:|:-----------------------------------------------------------------------------------------------------------------------------:| 217 | | 2. | **Dataset** | Larger subset of test dataset is perturbed by adding invariances and their contexts are preserved | 218 | | 3. | **Invariance** | Typos and expanding contractions (i.e.) how well the model handle these invariances | 219 | | 4. | **Example** | *Original text*: I haven't liked this product - **Negative**
*Invariance text*: I have not liekd this prodcut - **Negative** | 220 | | 5. | **Expected behavior** | Model should be generalized to handles these invariances and predict same label for both original and invariance texts | 221 | 222 | Benchmarking (defined in [stage_model_to_production.py](./scripts/stage_model_to_production.py)) is done as follows: 223 | - Latest and current production models are pulled from the model registry. 224 | - Test data (fresh data that the model hasn't seen during training) is fetched from S3 bucket. 225 | - **Behavioral testing** (perturbed data) and **evaluation** (original test data) is performed for both the models and metrics are returned. 226 | - If the latest model outperform the current production model, then push latest model into production and archive current production model. 227 | 228 | ``` 229 | productionalize_ = Productionalize(tracking_uri = config["model-tracking"]["mlflow_tracking_uri"], 230 | test_data = config["files"]["test_data"], 231 | model_name = config["model-registry"]["model_name"], 232 | batch_size = config["train-parameters"]["batch_size"], 233 | sequence_length = config["train-parameters"]["sequence_length"] 234 | ) 235 | 236 | accuracy_latest_model, accuracy_production_model = productionalize_.benchmark_models() 237 | 238 | success_ = productionalize_.push_new_model_to_production(accuracy_latest_model, accuracy_production_model) 239 | ``` 240 | 241 | | ![flowchart](./images/model_registry_latest1.PNG) | 242 | |:--:| 243 | | Figure 6: Model Registry with latest model pushed to production model and archiving the other one - EC2 Instance | 244 | 245 | | ![flowchart](./images//model_registry_latest2.PNG) | 246 | |:--:| 247 | | Figure 7: Model Registry with latest production model - EC2 Instance | 248 | 249 | ### CI/CD 250 | 251 | It involves packaging the model artifacts into an image and deploy them to cloud instance. The steps are as follows: 252 | - The model registry in **EC2 instance** holds the **latest production model** that have passed both testing and evaluation. 253 | - The production model from the model registry is packaged and build into a docker image with all required dependencies & metadata as a **deployable model artifact**. 254 | - This artifact is then pushed into **Amazon ECR** that serves as a container registry. 255 | 256 | | ![flowchart](./images/ecr_image.PNG) | 257 | |:--:| 258 | | Figure 8: Deployable docker image pushed to AWS ECR | 259 | 260 | - Finally, the deployable image from ECR is deployed into `AWS Sagemaker` instance which creates an **endpoint** that can be used to communicate with the model for inferencing. 261 | - The endpoint can be tested using some tools like `Postman`. 262 | - The aforementioned steps are defined in [deploy.py](./scripts/deploy.py). All the necessary secrets are exported as environment variables. Specific IAM role and user have been created for deployment. 263 | 264 | ``` 265 | sagemaker._deploy( 266 | mode = 'create', 267 | app_name = app_name, 268 | model_uri = model_uri, 269 | image_url = docker_image_url, 270 | execution_role_arn = role, 271 | instance_type = 'ml.m5.xlarge', 272 | instance_count = 1, 273 | region_name = region 274 | ) 275 | ``` 276 | 277 | | ![flowchart](./images/Sagemaker_endpoint.jpg) | 278 | |:--:| 279 | | Figure 9: Production model deployed to AWS Sagemaker | 280 | 281 | **Note:**
282 | *Every AWS resources created for this project will be deleted after the pipeline is executed successfully. This is done on purpose, to restrict and limit any incurring additional cost!!* 283 | 284 | ## Feedback 285 | 286 | If you have any feedback, please reach out to me at jithsasikumar@gmail.com 287 | 288 | ## Bug / Issues 289 | 290 | If you come across any bugs (or) issues related to code, model, implementation, results, pipeline etc, please feel free to open a [new issue here](https://github.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/issues/new) by describing your search query and expected result. 291 | 292 | ## References 293 | 294 | [Paper - Beyond Accuracy: Behavioral Testing of NLP models with CheckList](https://homes.cs.washington.edu/~marcotcr/acl20_checklist.pdf) 295 | 296 | [https://github.com/marcotcr/checklist](https://github.com/marcotcr/checklist) 297 | 298 | [AWS Documentations](https://docs.aws.amazon.com/) 299 | 300 | [Airflow Docs](https://airflow.apache.org/docs/) -------------------------------------------------------------------------------- /config/config.toml: -------------------------------------------------------------------------------- 1 | [tweets-scraping] 2 | search_query = "mlops" 3 | tweet_limit = 50000 4 | 5 | [aws] 6 | connection_id = "s3_connection" 7 | s3_bucket_name = "twitter-data-bucket" 8 | temp_data_path = "/opt/airflow/dags/" 9 | 10 | [files] 11 | raw_file_name = "raw_tweets.parquet" 12 | labelled_file_name = "labelled_tweets.parquet" 13 | preprocessed_file_name = "preprocessed_tweets.parquet" 14 | test_data = "./scripts/test_data/test_data.parquet" 15 | 16 | [train-parameters] 17 | batch_size = 128 18 | num_classes = 3 19 | embedding_dim = 128 20 | sequence_length = 512 21 | num_epochs = 4 22 | learning_rate = 2e-3 23 | 24 | [model-tracking] 25 | experiment = false 26 | experiment_name = "sentiment_classifier" 27 | run_name = "sc_run3" 28 | mlflow_tracking_uri = "http://ec2-44-203-120-100.compute-1.amazonaws.com:5000/" 29 | 30 | [model-registry] 31 | model_name = "sentiment_classifier" 32 | filter_string = "name LIKE 'sentiment%'" 33 | 34 | [model-deploy] 35 | endpoint_name = "sentiment-classifier" 36 | 37 | [misc] 38 | query = "SELECT * from PROCESSED_TWEET" 39 | table_name = "PROCESSED_TWEETS" -------------------------------------------------------------------------------- /dags/etl_twitter_dag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | @author: Jithin Sasikumar 5 | 6 | Script to define the data pipeline as Airflow DAG that performs ETL (Extract Load Transform) tasks such as 7 | scraping tweets from twitter, labelling, cleaning, normalizing and preprocessing the raw data to be used 8 | for analysis and model training on scheduled interval. 9 | """ 10 | 11 | import os 12 | import json 13 | import sys 14 | from datetime import datetime 15 | from airflow.decorators import task, dag 16 | from airflow.utils.task_group import TaskGroup 17 | from airflow.operators.python import PythonOperator 18 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 19 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook 20 | from snowflake.connector.pandas_tools import write_pandas 21 | from airflow.models.connection import Connection 22 | from task_definitions.etl_task_definitions import scrap_raw_tweets_from_web, preprocess_tweets 23 | from task_definitions.etl_task_definitions import add_sentiment_labels_to_tweets 24 | 25 | sys.path.append(os.path.join(os.path.dirname(__file__), "..")) 26 | from utils.helper import Config, Connections 27 | from utils.helper import load_dataframe 28 | 29 | 30 | # Load all configurations from config.toml 31 | config = Config() 32 | 33 | @dag(dag_id = "etl", start_date = datetime(2023,1,1), schedule_interval = "@monthly", catchup = False) 34 | def twitter_data_pipeline_dag_etl() -> None: 35 | """ 36 | Data pipeline for performing ETL task that has to be used for training. 37 | 38 | Returns 39 | ------- 40 | None 41 | """ 42 | 43 | @task(task_id = "configure_connections") 44 | def set_connections() -> None: 45 | """ 46 | Task 1 => Configure and establish respective connections for external services like 47 | AWS S3 buckets and Snowflake data warehouse. The credentials are stored as docker secrets 48 | in respective containers and accessed as environment variables for secure usage which 49 | restricts them from getting leaked in the docker image or repository. 50 | 51 | Note: 52 | AWS credentials are generated using specific IAM users and roles. 53 | 54 | Returns 55 | ------- 56 | None 57 | """ 58 | 59 | # AWS S3 connection 60 | aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"] 61 | aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"] 62 | aws_region_name = os.environ["REGION"] 63 | s3_credentials = json.dumps( 64 | dict( 65 | aws_access_key_id = aws_access_key_id, 66 | aws_secret_access_key = aws_secret_access_key, 67 | aws_region_name = aws_region_name, 68 | ) 69 | ) 70 | 71 | s3_connection = Connection(conn_id = "s3_connection", 72 | conn_type = "S3", 73 | extra = s3_credentials 74 | ) 75 | s3_conn_response = Connections(s3_connection).create_connections() 76 | 77 | # Snowflake connection 78 | login = os.environ["LOGIN"] 79 | password = os.environ["PASSWORD"] 80 | host_name = os.environ["HOST"] 81 | 82 | snowflake_connection = Connection(conn_id = "snowflake_conn", 83 | conn_type = "Snowflake", 84 | host = host_name, 85 | login = login, 86 | password = password 87 | ) 88 | 89 | snowflake_conn_response = Connections(snowflake_connection).create_connections() 90 | 91 | 92 | if not s3_conn_response and snowflake_conn_response: 93 | print("Connection not established!!") 94 | 95 | #Instantiating S3 hook for respective tasks 96 | s3_hook = S3Hook(aws_conn_id = config["aws"]["connection_id"]) 97 | 98 | # Task 2 => Refer respective task definition for documentation 99 | scrap_raw_tweets_from_web_ = PythonOperator( 100 | task_id = "scrap_raw_tweets_from_web", 101 | python_callable = scrap_raw_tweets_from_web, 102 | op_kwargs = { 103 | 's3_hook': s3_hook, 104 | 'bucket_name': config["aws"]["s3_bucket_name"], 105 | 'search_query': config["tweets-scraping"]["search_query"], 106 | 'tweet_limit': config["tweets-scraping"]["tweet_limit"], 107 | 'raw_file_name': config["files"]["raw_file_name"] 108 | } 109 | ) 110 | 111 | @task(task_id = "download_from_s3") 112 | def download_data_from_s3_bucket(temp_data_path: str, file_name: str) -> None: 113 | """ 114 | Task 3 => Download data stored in S3 buckets for usage. 115 | 116 | Parameters 117 | ---------- 118 | temp_data_path: str 119 | Path to save downloaded file. 120 | file_name: str 121 | Name of the downloaded file. 122 | 123 | Returns 124 | ------- 125 | None 126 | """ 127 | 128 | # Creating a S3 hook using the connection created via task 1. 129 | downloaded_file = s3_hook.download_file( 130 | key = file_name, 131 | bucket_name = config["aws"]["s3_bucket_name"], 132 | local_path = temp_data_path 133 | ) 134 | os.rename(src = downloaded_file, destination = f"{temp_data_path}/{file_name}") 135 | 136 | with TaskGroup(group_id = "sentiment_labelling") as group1: 137 | #Task 4 => Refer respective task definition for documentation 138 | add_sentiment_labels_to_scrapped_tweets_ = PythonOperator( 139 | task_id = "add_sentiment_labels_to_scrapped_tweets", 140 | python_callable = add_sentiment_labels_to_tweets, 141 | op_kwargs = { 142 | 's3_hook': s3_hook, 143 | 'bucket_name': config["aws"]["s3_bucket_name"], 144 | 'temp_data_path': config["aws"]["temp_data_path"], 145 | 'raw_file_name': config["files"]["raw_file_name"], 146 | 'labelled_file_name': config["files"]["labelled_file_name"], 147 | } 148 | ) 149 | 150 | # Prioritizing every downstream tasks pertaining to task group 1 151 | download_data_from_s3_bucket(config["aws"]["temp_data_path"], config["files"]["raw_file_name"]) >> add_sentiment_labels_to_scrapped_tweets_ 152 | 153 | 154 | with TaskGroup(group_id = "preprocess_tweets_using_NLP") as group2: 155 | #Task 5 => Refer respective task definition for documentation 156 | preprocess_tweets_ = PythonOperator( 157 | task_id = "preprocess_labelled_tweets_using_nlp_techniques", 158 | python_callable = preprocess_tweets, 159 | op_kwargs = { 160 | 's3_hook': s3_hook, 161 | 'bucket_name': config["aws"]["s3_bucket_name"], 162 | 'temp_data_path': config["aws"]["temp_data_path"], 163 | 'labelled_file_name': config["files"]["labelled_file_name"], 164 | 'preprocessed_file_name': config["files"]["preprocessed_file_name"] 165 | } 166 | ) 167 | 168 | # Prioritizing every downstream tasks pertaining to task group 2 169 | download_data_from_s3_bucket(config["aws"]["temp_data_path"], config["files"]["labelled_file_name"]) >> preprocess_tweets_ 170 | 171 | @task(task_id = "load_processed_data_to_datawarehouse") 172 | def load_processed_data_to_snowflake(processed_file: str, table_name: str) -> None: 173 | """ 174 | Task 6 => Load and write final processed data into snowflake data warehouse. It loads the processed parquet 175 | file as dataframe and loads it as a database table into the data warehouse. 176 | 177 | Parameters 178 | ---------- 179 | processed_file: str 180 | Name of preprocessed parquet file. 181 | table_name: str 182 | Name of the database table in snowflake data warehouse. 183 | 184 | Returns 185 | ------- 186 | None 187 | """ 188 | try: 189 | # Similar to S3 hook, snowflake hook is used accordingly 190 | snowflake_conn = SnowflakeHook( 191 | snowflake_conn_id = "snowflake_conn", 192 | account = os.environ["ACCOUNT"], 193 | warehouse = os.environ["WAREHOUSE"], 194 | database = os.environ["DATABASE"], 195 | schema = os.environ["SCHEMA"], 196 | role = os.environ["ROLE"] 197 | ) 198 | 199 | dataframe = load_dataframe(processed_file) 200 | 201 | # Functionality to write any pandas dataframe into snowflake 202 | write_pandas( 203 | conn = snowflake_conn, 204 | df = dataframe, 205 | table_name = table_name, 206 | quote_identifiers = False 207 | ) 208 | 209 | except Exception as exc: 210 | raise ConnectionError("Something went wrong with the snowflake connection. Please check them!!") from exc 211 | 212 | finally: 213 | snowflake_conn.close() 214 | 215 | # Prioritizing every downstream tasks pertaining to the entire DAG 216 | set_connections() >> scrap_raw_tweets_from_web_>> group1 >> group2 >> load_processed_data_to_snowflake(config["files"]["preprocessed_file_name"], config["misc"]["table_name"]) 217 | 218 | 219 | etl_dag = twitter_data_pipeline_dag_etl() -------------------------------------------------------------------------------- /dags/model_training_dag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | @author: Jithin Sasikumar 5 | 6 | Script to define model training pipeline as Airflow DAG that trains Bi-LSTM model with the 7 | processed data from data warehouse. In the DAG, in order to improve the training time and 8 | efficiency, the model training is done within an external (user-build) docker container with 9 | tensorflow-gpu base image and it is not included in airflow docker compose. 10 | It is a GPU accelerated training. 11 | 12 | """ 13 | 14 | import os 15 | import sys 16 | import pandas as pd 17 | from datetime import datetime 18 | from airflow.decorators import task, dag 19 | from airflow.providers.docker.operators.docker import DockerOperator 20 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook 21 | 22 | sys.path.append(os.path.join(os.path.dirname(__file__), "..")) 23 | from utils.helper import Config 24 | 25 | # Load all configurations from config.toml 26 | config = Config() 27 | 28 | @dag(dag_id = "model_training", start_date = datetime(2023,1,1), schedule_interval = "@monthly", catchup = False) 29 | def model_training_pipeline_dag() -> None: 30 | """ 31 | Pipeline to perform the GPU accelerated model training within the user-build docker image 32 | 33 | Returns 34 | ------- 35 | None 36 | """ 37 | 38 | @task(task_id = "load_data_from_warehouse") 39 | def pull_snowflake_data_as_df(query: str) -> pd.DataFrame: 40 | """ 41 | Task 1 => Loaded data as a result of ETL pipeline is fetched from the database 42 | table of snowflake data warehouse as a dataframe. This will be used for 43 | model training. 44 | 45 | Parameters 46 | ---------- 47 | query: str 48 | Database query 49 | 50 | Returns 51 | ------- 52 | dataframe: pd.DataFrame 53 | Fetched data 54 | """ 55 | try: 56 | snowflake_conn = SnowflakeHook( 57 | snowflake_conn_id = "snowflake_conn", 58 | account = os.environ["ACCOUNT"], 59 | warehouse = os.environ["WAREHOUSE"], 60 | database = os.environ["DATABASE"], 61 | schema = os.environ["SCHEMA"], 62 | role = os.environ["ROLE"] 63 | ) 64 | 65 | cursor = snowflake_conn.cursor().execute(query) 66 | dataframe = cursor.fetch_pandas_all() 67 | 68 | return dataframe 69 | 70 | except Exception as exc: 71 | raise ConnectionError("Snowflake connection error. Please check and try again!!") from exc 72 | 73 | finally: 74 | cursor.close() 75 | snowflake_conn.close() 76 | 77 | 78 | # Task 2 => Refer /task_definitions/model_training.py for documentation 79 | train_model = DockerOperator( 80 | task_id = "train_model_task", 81 | image = "model_training_tf:latest", 82 | auto_remove = True, 83 | docker_url = "unix://var/run/docker.sock", 84 | api_version = "auto", 85 | command = "python3 model_training.py" 86 | ) 87 | 88 | pull_snowflake_data_as_df(config["misc"]["query"]) >> train_model 89 | 90 | model_train_dag = model_training_pipeline_dag() -------------------------------------------------------------------------------- /dags/task_definitions/etl_task_definitions.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Jithin Sasikumar 3 | 4 | Module that defines every task required for ETL data pipeline (DAG) to run successfully. 5 | """ 6 | import os 7 | import sys 8 | import pandas as pd 9 | import snscrape.modules.twitter as sntwitter 10 | import nltk 11 | from nltk.tokenize import word_tokenize 12 | from nltk.corpus import stopwords 13 | from nltk.stem import WordNetLemmatizer 14 | from nltk.stem.porter import PorterStemmer 15 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 16 | 17 | sys.path.append(os.path.join(os.path.dirname(__file__), "...")) 18 | from utils import helper 19 | nltk.download('punkt') 20 | nltk.download('stopwords') 21 | stopwords_ = stopwords.words("english") 22 | nltk.download('wordnet') 23 | nltk.download('omw-1.4') 24 | nltk.download('vader_lexicon') 25 | 26 | def scrap_raw_tweets_from_web(**kwargs) -> None: 27 | """ 28 | Scrap raw tweets from twitter using snscrape library and load it as parquet file to S3 bucket. 29 | 30 | Parameters 31 | ---------- 32 | **kwargs: Arbitrary keyword arguments 33 | See below for expansion 34 | 35 | keyword arguments 36 | ----------------- 37 | **s3_hook: S3Hook 38 | Instance of S3Hook to connect with specified S3 bucket. 39 | **bucket_name: str 40 | Name of S3 bucket to load resulting raw parquet file. 41 | **search_query: str 42 | Keyword or topic to scrap the tweets. 43 | **tweet_limit: int 44 | Limit of tweets to scrap from. 45 | **raw_file_name: str 46 | Name of raw parquet file to be loaded to S3. 47 | 48 | Returns 49 | ------- 50 | None 51 | """ 52 | tweets = list() 53 | try: 54 | for index, tweet in enumerate(sntwitter.TwitterSearchScraper(kwargs["search_query"]).get_items()): 55 | if index != kwargs["tweet_limit"]: 56 | tweets.append([tweet.date, tweet.id, tweet.lang, 57 | tweet.user.username, tweet.content]) 58 | 59 | raw_tweets_dataframe = pd.DataFrame( 60 | tweets, 61 | columns = [ 62 | 'datetime', 'id', 63 | 'lang', 'username', 64 | 'raw_tweets' 65 | ] 66 | ) 67 | 68 | raw_tweets_dataframe.to_parquet(kwargs["raw_file_name"], 69 | index = False, engine = "pyarrow") 70 | kwargs["s3_hook"].load_file( 71 | filename = kwargs["raw_file_name"], 72 | key = kwargs["raw_file_name"], 73 | bucket_name = kwargs["bucket_name"] 74 | ) 75 | 76 | except Exception as exc: 77 | raise Exception("Something went wrong with the tweet scraping task. Please check them!!") from exc 78 | 79 | def add_sentiment_labels_to_tweets(**kwargs) -> None: 80 | """ 81 | Calculate polarity of tweets and assign sentiment labels for the same fro S3 bucket as extracted raw tweets 82 | are unlabelled. 83 | 84 | Parameters 85 | ---------- 86 | **kwargs: Arbitrary keyword arguments 87 | See below for expansion 88 | 89 | keyword arguments 90 | ----------------- 91 | **s3_hook: S3Hook 92 | Instance of S3Hook to connect with specified S3 bucket. 93 | **bucket_name: str 94 | Name of S3 bucket to load resulting raw parquet file. 95 | **temp_data_path: str 96 | Path to save intermittent temp file as a buffer. 97 | **raw_file_name: str 98 | Name of raw parquet file from S3. 99 | **labelled_file_name: str 100 | Name of file containing respective sentiment labels. 101 | 102 | Returns 103 | ------- 104 | None 105 | """ 106 | dataframe = pd.read_parquet( 107 | path = f"{kwargs['temp_data_path']}/{kwargs['raw_file_name']}", 108 | engine = "pyarrow" 109 | ) 110 | dataframe_en = dataframe[dataframe['lang'] == "en"] 111 | dataframe_en["cleaned_tweets"] = dataframe_en["raw_tweets"].apply( 112 | lambda text: helper.remove_noise(text) 113 | ) 114 | dataframe_en["polarity"] = dataframe_en["cleaned_tweets"].apply( 115 | lambda text: helper.calculate_polarity(text) 116 | ) 117 | dataframe_en["sentiment"] = dataframe_en["polarity"].apply( 118 | lambda score: helper.assign_sentiment_labels(score) 119 | ) 120 | 121 | dataframe_en.to_parquet(kwargs["labelled_file_name"], 122 | index = True, engine = "pyarrow") 123 | kwargs["s3_hook"].load_file( 124 | filename = kwargs["labelled_file_name"], 125 | key = kwargs["labelled_file_name"], 126 | bucket_name = kwargs["bucket_name"] 127 | ) 128 | 129 | def preprocess_tweets(**kwargs) -> None: 130 | """ 131 | Normalize and preprocess labelled tweets from S3 using NLP techniques which wil be used for 132 | model training. 133 | 134 | Parameters 135 | ---------- 136 | **kwargs: Arbitrary keyword arguments 137 | See below for expansion 138 | 139 | keyword arguments 140 | ----------------- 141 | **s3_hook: S3Hook 142 | Instance of S3Hook to connect with specified S3 bucket. 143 | **bucket_name: str 144 | Name of S3 bucket to load resulting raw parquet file. 145 | **temp_data_path: str 146 | Path to save intermittent temp file as a buffer. 147 | **labelled_file_name: str 148 | Name of file containing respective sentiment labels. 149 | *preprocessed_file_name: str 150 | Name of the file to be loaded to s3 after preprocessing. 151 | 152 | Returns 153 | ------- 154 | None 155 | """ 156 | dataframe = pd.read_parquet(path = f"{kwargs['temp_data_path']}/{kwargs['labelled_file_name']}", 157 | engine = "pyarrow") 158 | dataframe = dataframe.iloc[: , 1:] 159 | dataframe['cleaned_tweets'] = dataframe['cleaned_tweets'].astype(str).str.lower() 160 | dataframe['tokenized_tweets'] = dataframe["cleaned_tweets"].apply(word_tokenize) 161 | 162 | #Remove stopwords 163 | dataframe['tokenized_tweets'] = dataframe['tokenized_tweets'].apply( 164 | lambda tokens: helper.remove_stopwords(tokens, stopwords_) 165 | ) 166 | dataframe = helper.remove_less_frequent_words(dataframe) 167 | 168 | #Lemmatize each tweet 169 | wordnet_lem = WordNetLemmatizer() 170 | dataframe['lemmatized_tweets'] = dataframe['tokenized_strings'].apply(lambda tweet: " ".join([ 171 | wordnet_lem.lemmatize(word) 172 | for word in tweet.split()])) 173 | 174 | #Stem each tweet 175 | porter_stemmer = PorterStemmer() 176 | dataframe['processed_tweets'] = dataframe['lemmatized_tweets'].apply(lambda tweet: " ".join([ 177 | porter_stemmer.stem(word) 178 | for word in tweet.split()])) 179 | 180 | dataframe = dataframe.reindex(columns = [col for col in dataframe.columns if col != 'sentiment'] + ['sentiment']) 181 | 182 | # Encoding labels (integers) to sentiments 183 | dataframe['labels'] = dataframe['sentiment'].map( 184 | { 185 | "neutral": 0, 186 | "negative": 1, 187 | "positive": 2 188 | } 189 | ) 190 | # Printing in console to ensure that the entire process is successful which can be later accessed from Airflow logs 191 | print(dataframe.shape, dataframe.columns) 192 | 193 | dataframe.to_parquet(kwargs["preprocessed_file_name"], 194 | index = False, engine = "pyarrow") 195 | kwargs["s3_hook"].load_file( 196 | filename = kwargs["preprocessed_file_name"], 197 | key = kwargs["preprocessed_file_name"], 198 | bucket_name = kwargs["bucket_name"] 199 | ) -------------------------------------------------------------------------------- /dags/task_definitions/model_training.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | @author: Jithin Sasikumar 5 | 6 | Script to perform Bi-directional LSTM training with BERT tokenizer. This script will be copied and 7 | executed inside external (user-build) docker container with tensorflow GPU installed. This is 8 | provided in this directory for reference. 9 | 10 | Every training run will be tracked, artifacts are logged by MLflow tracking server hosted on AWS EC2 instance. 11 | (i.e.) training is performed locally using GPU via user-build docker container and entire model tracking & 12 | logging happens in the EC2 instance by the tracking server. 13 | 14 | """ 15 | 16 | import os 17 | import sys 18 | import pandas as pd 19 | from tqdm.auto import tqdm 20 | from dataclasses import dataclass 21 | import tensorflow as tf 22 | from sklearn.model_selection import train_test_split 23 | from keras.models import Sequential 24 | from keras.utils import to_categorical 25 | from keras import losses, optimizers, metrics 26 | from transformers import BertTokenizer 27 | 28 | sys.path.append(os.path.join(os.path.dirname(__file__), "...")) 29 | from utils.helper import load_dataframe 30 | from utils.prepare_data import Dataset 31 | from utils.model import BiLSTM_Model 32 | from utils.helper import Config 33 | from utils.experiment_tracking import MLFlowTracker 34 | 35 | config = Config() 36 | 37 | @dataclass 38 | class Train_parameters: 39 | """ 40 | Dataclass for holding parameter values for training. 41 | 42 | Member variables 43 | ---------------- 44 | batch_size: int 45 | Number of samples per gradient update. 46 | num_classes: int 47 | Number of output labels or classes. 48 | embedding_dim: int 49 | Number of output embedding vectors for embedding layer. 50 | sequence_length: int 51 | Size of each input sequence 52 | num_epochs: int 53 | Number of epochs to train the model. 54 | """ 55 | batch_size: int 56 | num_classes: int 57 | embedding_dim: int 58 | sequence_length: int 59 | num_epochs: int 60 | learning_rate: float 61 | 62 | @dataclass 63 | class Model_tracking_parameters: 64 | """ 65 | Dataclass for holding parameter values for model tracking. 66 | 67 | Member variables 68 | ---------------- 69 | experiment_name: str 70 | Name of experiment to log as MLflow run. 71 | mlflow_tracking_uri: str 72 | URI of EC2 instance where MLflow server is hosted. 73 | run_name: str 74 | Name of training run pertaining to an experiment 75 | experiment: bool 76 | True to create a new experiment, else False. 77 | """ 78 | experiment_name: str 79 | mlflow_tracking_uri: str 80 | run_name: str 81 | experiment: bool 82 | 83 | class Training: 84 | def __init__(self, training_args: Train_parameters, 85 | model_tracking_args: Model_tracking_parameters 86 | ): 87 | 88 | """ 89 | Instance variables 90 | ------------------ 91 | training_args: Train_parameters 92 | Instance of Train_parameters 93 | model_tracking_args: Model_tracking_parameters 94 | Instance of Model_tracking_parameters 95 | """ 96 | self.training_args = training_args 97 | self.model_tracking_args = model_tracking_args 98 | 99 | def check_and_set_gpu(self) -> tf.config.LogicalDevice: 100 | """ 101 | Configure and set GPU for model training, else use CPU by default. 102 | 103 | Parameters 104 | ---------- 105 | None 106 | 107 | Returns 108 | ------- 109 | logical_gpu: tf.config.LogicalDevice 110 | List of initialized logical devices. 111 | 112 | Raises 113 | ------ 114 | RuntimeError: Exception 115 | If GPU setting failed during runtime. 116 | """ 117 | try: 118 | available_gpu_devices = tf.config.experimental.list_physical_devices("GPU") 119 | if len(available_gpu_devices) > 0: 120 | # Since the system has only one GPU, setting it to the first GPU 121 | tf.config.set_visible_devices(available_gpu_devices[0], "GPU") 122 | # Allocating GPU memory based on the runtime 123 | tf.config.experimental.set_memory_growth(available_gpu_devices[0], True) 124 | logical_gpu = tf.config.list_logical_devices("GPU") 125 | 126 | except Exception as exc: 127 | raise RuntimeError("Runtime failed in GPU setting. Please check and try again!!") from exc 128 | 129 | return logical_gpu 130 | 131 | def train(self) -> None: 132 | """ 133 | Method that initializes and performs model training. 134 | 135 | Parameters 136 | ---------- 137 | None 138 | 139 | Returns 140 | ------- 141 | None 142 | """ 143 | 144 | # Configure physical GPU to logical device in the runtime and assert whether it's successful 145 | gpu = self.check_and_set_gpu() 146 | assert len(gpu) > 0 147 | 148 | tracker = MLFlowTracker(experiment_name = self.model_tracking_args.experiment_name, 149 | tracking_uri = self.model_tracking_args.mlflow_tracking_uri, 150 | run_name = self.model_tracking_args.run_name, 151 | experiment = self.model_tracking_args.experiment) 152 | tracker.log() 153 | 154 | dataframe: pd.DataFrame = load_dataframe("./preprocessed_tweets.parquet") 155 | df = dataframe[['cleaned_tweets','labels']].iloc[0:35000].copy() 156 | train_dataframe, test_dataframe = train_test_split(df, test_size = 0.25, 157 | random_state = 42, 158 | stratify = df['labels']) 159 | train_dataframe.dropna(inplace = True) 160 | test_dataframe.dropna(inplace = True) 161 | 162 | y_train = to_categorical(train_dataframe['labels'], num_classes = self.training_args.num_classes) 163 | y_test = to_categorical(test_dataframe['labels'], num_classes = self.training_args.num_classes) 164 | 165 | # Using the BERT tokenizer to tokenize every input tweets, rather than a normal tokenizer 166 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 167 | train_dataset: tf.data.Dataset.zip = Dataset(tokenizer = tokenizer, dataframe = train_dataframe, 168 | labels = y_train, batch_size = self.training_args.batch_size, 169 | max_length = self.training_args.sequence_length, 170 | train = True).encode_bert_tokens_to_tf_dataset() 171 | 172 | test_dataset: tf.data.Dataset.zip = Dataset(tokenizer = tokenizer, dataframe = test_dataframe, 173 | labels = y_test, batch_size = self.training_args.batch_size, 174 | max_length = self.training_args.sequence_length, 175 | train = True).encode_bert_tokens_to_tf_dataset() 176 | 177 | model: Sequential = BiLSTM_Model( 178 | tokenizer.vocab_size, 179 | self.training_args.num_classes, 180 | self.training_args.embedding_dim, 181 | self.training_args.sequence_length).create_model() 182 | 183 | print("Training started.....") 184 | model.compile( 185 | loss = losses.CategoricalCrossentropy(), 186 | optimizer = optimizers.Adam( 187 | learning_rate = self.training_args.learning_rate, 188 | epsilon=1e-08), 189 | metrics = [metrics.CategoricalAccuracy('accuracy')] 190 | ) 191 | 192 | model.fit( 193 | train_dataset, 194 | validation_data = test_dataset, 195 | epochs = self.training_args.num_epochs, 196 | batch_size = self.training_args.batch_size 197 | ) 198 | 199 | tracker.end() 200 | 201 | def main() -> None: 202 | training_parameters_ = Train_parameters( 203 | config["train-parameters"]["batch_size"], 204 | config["train-parameters"]["num_classes"], 205 | config["train-parameters"]["embedding_dim"], 206 | config["train-parameters"]["sequence_length"], 207 | config["train-parameters"]["num_epochs"], 208 | config["train-parameters"]["learning_rate"], 209 | ) 210 | 211 | model_tracking_parameters_ = Model_tracking_parameters( 212 | config["model-tracking"]["experiment_name"], 213 | config["model-tracking"]["mlflow_tracking_uri"], 214 | config["model-tracking"]["run_name"], 215 | config["model-tracking"]["experiment"] 216 | ) 217 | 218 | model_training_ = Training( 219 | training_parameters_, 220 | model_tracking_parameters_ 221 | ) 222 | 223 | model_training_.train() 224 | 225 | if __name__ == "__main__": 226 | main() -------------------------------------------------------------------------------- /dependencies/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base docker image with all required dependencies and secrets mounted 2 | 3 | FROM apache/airflow:2.4.1-python3.9 4 | 5 | COPY ./dependencies/requirements.txt /requirements.txt 6 | 7 | RUN pip install --user --upgrade pip 8 | 9 | RUN pip install -r /requirements.txt 10 | 11 | # Mounting every docker secrets into the docker image as environment variables, 12 | # so that they aren't leaked & exposed by layer caching during image build 13 | RUN --mount=type=secret,id=AWS_ACCESS_KEY_ID \ 14 | --mount=type=secret,id=AWS_SECRET_ACCESS_KEY \ 15 | --mount=type=secret,id=REGION \ 16 | --mount=type=secret,id=LOGIN \ 17 | --mount=type=secret,id=PASSWORD \ 18 | --mount=type=secret,id=HOST \ 19 | --mount=type=secret,id=ACCOUNT \ 20 | --mount=type=secret,id=WAREHOUSE \ 21 | --mount=type=secret,id=DATABASE \ 22 | --mount=type=secret,id=SCHEMA \ 23 | export AWS_ACCESS_KEY_ID=$(cat /run/secrets/AWS_ACCESS_KEY_ID) && \ 24 | export AWS_SECRET_ACCESS_KEY=$(cat /run/secrets/AWS_SECRET_ACCESS_KEY) && \ 25 | export REGION=$(cat /run/secrets/REGION) && \ 26 | export LOGIN=$(cat /run/secrets/LOGIN) && \ 27 | export PASSWORD=$(cat /run/secrets/PASSWORD) && \ 28 | export HOST=$(cat /run/secrets/HOST) && \ 29 | export ACCOUNT=$(cat /run/secrets/ACCOUNT) && \ 30 | export WAREHOUSE=$(cat /run/secrets/WAREHOUSE) && \ 31 | export DATABASE=$(cat /run/secrets/DATABASE) && \ 32 | export SCHEMA=$(cat /run/secrets/SCHEMA) -------------------------------------------------------------------------------- /dependencies/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.3.5 2 | nltk==3.7 3 | textblob===0.17.1 4 | snscrape==0.4.3.20220106 5 | tomli==2.0.1 6 | apache-airflow[amazon]==2.4.2 7 | transformers==4.24.0 8 | numpy==1.23.4 9 | tensorflow==2.10.0 10 | pyOpenSSL==22.1.0 11 | pyarrow==8.0.0 12 | cryptography==38.0.1 13 | snowflake-connector-python==2.9.0 14 | apache-airflow-providers-snowflake==4.0.2 15 | apache-airflow-providers-docker==3.4.0 16 | spacy==3.5.0 17 | mlflow==2.1.1 18 | checklist==0.0.11 19 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.4.1 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | image: ${AIRFLOW_IMAGE_NAME:-extending_airflow:latest} 48 | # build: . 49 | environment: 50 | &airflow-common-env 51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 52 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 53 | # For backward compatibility, with Airflow <2.3 54 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 55 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 56 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 57 | AIRFLOW__CORE__FERNET_KEY: '' 58 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 59 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 60 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 61 | AWS_ACCESS_KEY_ID: /run/secrets/aws_access_key_id 62 | AWS_SECRET_ACCESS_KEY: /run/secrets/aws_secret_access_key 63 | REGION_NAME: /run/secrets/region_name 64 | LOGIN: /run/secrets/login 65 | PASSWORD: /run/secrets/password 66 | HOST: /run/secrets/host 67 | ACCOUNT: /run/secrets/account 68 | WAREHOUSE: /run/secrets/warehouse 69 | DATABASE: /run/secrets/database 70 | SCHEMA: /run/secrets/schema 71 | volumes: 72 | - ./dags:/opt/airflow/dags 73 | - ./logs:/opt/airflow/logs 74 | - ./plugins:/opt/airflow/plugins 75 | - ./config:/opt/airflow/config 76 | user: "${AIRFLOW_UID:-50000}:0" 77 | depends_on: 78 | &airflow-common-depends-on 79 | redis: 80 | condition: service_healthy 81 | postgres: 82 | condition: service_healthy 83 | 84 | services: 85 | postgres: 86 | image: postgres:13 87 | environment: 88 | POSTGRES_USER: airflow 89 | POSTGRES_PASSWORD: airflow 90 | POSTGRES_DB: airflow 91 | volumes: 92 | - postgres-db-volume:/var/lib/postgresql/data 93 | healthcheck: 94 | test: ["CMD", "pg_isready", "-U", "airflow"] 95 | interval: 5s 96 | retries: 5 97 | restart: always 98 | 99 | redis: 100 | image: redis:latest 101 | expose: 102 | - 6379 103 | healthcheck: 104 | test: ["CMD", "redis-cli", "ping"] 105 | interval: 5s 106 | timeout: 30s 107 | retries: 50 108 | restart: always 109 | 110 | airflow-webserver: 111 | <<: *airflow-common 112 | command: webserver 113 | ports: 114 | - 8080:8080 115 | healthcheck: 116 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 117 | interval: 10s 118 | timeout: 10s 119 | retries: 5 120 | restart: always 121 | depends_on: 122 | <<: *airflow-common-depends-on 123 | airflow-init: 124 | condition: service_completed_successfully 125 | 126 | airflow-scheduler: 127 | <<: *airflow-common 128 | command: scheduler 129 | healthcheck: 130 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 131 | interval: 10s 132 | timeout: 10s 133 | retries: 5 134 | restart: always 135 | depends_on: 136 | <<: *airflow-common-depends-on 137 | airflow-init: 138 | condition: service_completed_successfully 139 | 140 | airflow-worker: 141 | <<: *airflow-common 142 | command: celery worker 143 | healthcheck: 144 | test: 145 | - "CMD-SHELL" 146 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 147 | interval: 10s 148 | timeout: 10s 149 | retries: 5 150 | environment: 151 | <<: *airflow-common-env 152 | # Required to handle warm shutdown of the celery workers properly 153 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 154 | DUMB_INIT_SETSID: "0" 155 | restart: always 156 | depends_on: 157 | <<: *airflow-common-depends-on 158 | airflow-init: 159 | condition: service_completed_successfully 160 | 161 | airflow-triggerer: 162 | <<: *airflow-common 163 | command: triggerer 164 | healthcheck: 165 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 166 | interval: 10s 167 | timeout: 10s 168 | retries: 5 169 | restart: always 170 | depends_on: 171 | <<: *airflow-common-depends-on 172 | airflow-init: 173 | condition: service_completed_successfully 174 | 175 | airflow-init: 176 | <<: *airflow-common 177 | entrypoint: /bin/bash 178 | # yamllint disable rule:line-length 179 | command: 180 | - -c 181 | - | 182 | function ver() { 183 | printf "%04d%04d%04d%04d" $${1//./ } 184 | } 185 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 186 | airflow_version_comparable=$$(ver $${airflow_version}) 187 | min_airflow_version=2.2.0 188 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 189 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 190 | echo 191 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 192 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 193 | echo 194 | exit 1 195 | fi 196 | if [[ -z "${AIRFLOW_UID}" ]]; then 197 | echo 198 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 199 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 200 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 201 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 202 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 203 | echo 204 | fi 205 | one_meg=1048576 206 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 207 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 208 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 209 | warning_resources="false" 210 | if (( mem_available < 4000 )) ; then 211 | echo 212 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 213 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 214 | echo 215 | warning_resources="true" 216 | fi 217 | if (( cpus_available < 2 )); then 218 | echo 219 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 220 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 221 | echo 222 | warning_resources="true" 223 | fi 224 | if (( disk_available < one_meg * 10 )); then 225 | echo 226 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 227 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 228 | echo 229 | warning_resources="true" 230 | fi 231 | if [[ $${warning_resources} == "true" ]]; then 232 | echo 233 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 234 | echo "Please follow the instructions to increase amount of resources available:" 235 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 236 | echo 237 | fi 238 | mkdir -p /sources/logs /sources/dags /sources/plugins 239 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 240 | exec /entrypoint airflow version 241 | # yamllint enable rule:line-length 242 | environment: 243 | <<: *airflow-common-env 244 | _AIRFLOW_DB_UPGRADE: 'true' 245 | _AIRFLOW_WWW_USER_CREATE: 'true' 246 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 247 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 248 | _PIP_ADDITIONAL_REQUIREMENTS: '' 249 | user: "0:0" 250 | volumes: 251 | - .:/sources 252 | 253 | airflow-cli: 254 | <<: *airflow-common 255 | profiles: 256 | - debug 257 | environment: 258 | <<: *airflow-common-env 259 | CONNECTION_CHECK_MAX_COUNT: "0" 260 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 261 | command: 262 | - bash 263 | - -c 264 | - airflow 265 | 266 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 267 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 268 | # See: https://docs.docker.com/compose/profiles/ 269 | flower: 270 | <<: *airflow-common 271 | command: celery flower 272 | profiles: 273 | - flower 274 | ports: 275 | - 5555:5555 276 | healthcheck: 277 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 278 | interval: 10s 279 | timeout: 10s 280 | retries: 5 281 | restart: always 282 | depends_on: 283 | <<: *airflow-common-depends-on 284 | airflow-init: 285 | condition: service_completed_successfully 286 | 287 | volumes: 288 | postgres-db-volume: 289 | -------------------------------------------------------------------------------- /images/Sagemaker_endpoint.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/Sagemaker_endpoint.jpg -------------------------------------------------------------------------------- /images/architecture_diagram.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/architecture_diagram.jpeg -------------------------------------------------------------------------------- /images/ecr_image.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/ecr_image.PNG -------------------------------------------------------------------------------- /images/etl_dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/etl_dag.PNG -------------------------------------------------------------------------------- /images/mlflow_exps.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/mlflow_exps.PNG -------------------------------------------------------------------------------- /images/model_dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_dag.PNG -------------------------------------------------------------------------------- /images/model_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_plot.png -------------------------------------------------------------------------------- /images/model_registry_latest1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_registry_latest1.PNG -------------------------------------------------------------------------------- /images/model_registry_latest2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_registry_latest2.PNG -------------------------------------------------------------------------------- /images/model_registry_org.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_registry_org.PNG -------------------------------------------------------------------------------- /scripts/behavioral_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Jithin Sasikumar 3 | 4 | Module to define and perform behavioral testing of sentiment analysis model. It is based on 5 | the paper [1] that proposes three different types of tests but only two tests are performed 6 | in this project namely - 7 | - Minimum Functionality test (MFT) 8 | - Invariance test (INV) 9 | 10 | Note 11 | ---- 12 | Model testing differs from model evaluation. 13 | 14 | References 15 | ---------- 16 | [1] Beyond Accuracy: Behavioral Testing of NLP models with CheckList 17 | [2] https://github.com/marcotcr/checklist 18 | """ 19 | 20 | import os 21 | import spacy 22 | import numpy as np 23 | import pandas as pd 24 | import tensorflow as tf 25 | from checklist.perturb import Perturb 26 | from keras.models import Sequential 27 | from sklearn.metrics import accuracy_score 28 | nlp = spacy.load('en_core_web_sm') 29 | 30 | 31 | def min_functionality_test(dataframe: pd.DataFrame) -> pd.DataFrame: 32 | """ 33 | Function to perturb test data which is suitable to perform MFT. A specific behavior (or) 34 | capability of the model is tested. In this case, the specific behavior to be tested 35 | is `negation` (i.e.) how well the model handles negated inputs. 36 | 37 | More detailed information can be found in the README.md 38 | 39 | Parameters 40 | ---------- 41 | dataframe: pd.DataFrame 42 | Test dataframe consisting of original text. 43 | 44 | Returns 45 | ------- 46 | negated_dataframe: pd.DataFrame 47 | Dataframe after negating original texts with their corresponding labels. 48 | """ 49 | 50 | original_text: list = dataframe["sample_text"].tolist() 51 | true_labels: list = dataframe["labels"].tolist() 52 | piped_text = list(nlp.pipe(original_text)) 53 | 54 | # Adding negation to original text using `checklist` package 55 | perturbed_data = Perturb.perturb(piped_text, Perturb.add_negation) 56 | negated_texts: list = [text[1] for text in perturbed_data.data] 57 | 58 | negated_dataframe = pd.DataFrame( 59 | list(zip(negated_texts, true_labels)), 60 | columns = ["negated_text", "labels"] 61 | ) 62 | 63 | return negated_dataframe 64 | 65 | def invariance_test(text: str) -> str: 66 | """ 67 | Function to perturb test data which is suitable to perform invariance test. 68 | The test data is perturbed in a way that their context are preserved. Despite 69 | perturbing the data, the model is expected to generalize well and predict the 70 | same labels pertaining to the actual test data. 71 | 72 | Two perturbations are added namely: 73 | - Adding typos to the actual test data. 74 | - Expanding contractions to the same. 75 | 76 | Parameters 77 | ---------- 78 | text: str 79 | Input text from actual test data. 80 | 81 | Returns 82 | ------- 83 | perturbed_text: str 84 | Resulting text after applying two perturbations. 85 | """ 86 | 87 | text_with_typo = str(Perturb.add_typos(text)) 88 | perturbed_text = Perturb.expand_contractions(text_with_typo) 89 | return perturbed_text 90 | 91 | 92 | def run(test_name: str, model: Sequential, 93 | test_dataset: tf.data.Dataset.zip, 94 | dataframe: pd.DataFrame) -> float: 95 | """ 96 | Function to perform specified behavioral test using perturbed data. 97 | 98 | Parameters 99 | ---------- 100 | test_name: str 101 | Name of test (MFT or invariance). 102 | model: Sequential 103 | Trained (or) productionalized model pulled from model registry 104 | in EC2 instance. 105 | test_dataset: tf.data.Dataset.zip 106 | Perturbed dataset transformed to tensorflow dataset format. 107 | dataframe: pd.DataFrame 108 | Dataframe where test results will be written and saved at the 109 | end as CSV for analysis and benchmarking. 110 | 111 | Returns 112 | ------- 113 | test_accuracy: float 114 | """ 115 | try: 116 | for text, _ in test_dataset.take(1): 117 | text_ = text.numpy() 118 | 119 | except Exception: 120 | print(f"Exception occurred when trying to access {test_dataset}. Please check!!") 121 | 122 | else: 123 | predicted_probabilities = model.predict(text_) 124 | predicted_labels = np.argmax( 125 | np.array(predicted_probabilities), 126 | axis = 1 127 | ) 128 | 129 | dataframe["predicted_labels"] = predicted_labels 130 | dataframe["predicted_probabilities"] = predicted_probabilities.tolist() 131 | 132 | # Save test results as CSv 133 | dataframe_path = os.path.join(os.getcwd(), "test_results") 134 | dataframe.to_csv(f"{dataframe_path}/{test_name}_test_results.csv", index = False) 135 | 136 | test_accuracy = accuracy_score( 137 | y_true = dataframe['labels'].tolist(), 138 | y_pred = dataframe['predicted_labels'].tolist() 139 | ) 140 | 141 | return test_accuracy -------------------------------------------------------------------------------- /scripts/deploy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | @author: Jithin Sasikumar 5 | 6 | Script to deploy productionalized model into AWS Sagemaker. The production model 7 | from MLflow model registry in EC2 instance is packaged into a docker image as a 8 | deployable model artifact and pushed into Amazon ECR. The deployable image from 9 | AWS ECR is then deployed into AWS Sagemaker instance which creates an endpoint that 10 | can be used to communicate with the model for inferencing. 11 | """ 12 | 13 | import os 14 | import sys 15 | import mlflow 16 | from mlflow import sagemaker 17 | 18 | sys.path.append(os.path.join(os.path.dirname(__file__), "..")) 19 | from utils.helper import Config 20 | 21 | config = Config() 22 | 23 | mlflow.set_tracking_uri(config["model-tracking"]["mlflow_tracking_uri"]) 24 | 25 | #Name of the resulting endpoint 26 | app_name = config["model-deploy"]["endpoint_name"] 27 | 28 | # Location of mlflow production model to be deployed from remote server 29 | model_name = config["model-registry"]["model_name"] 30 | model_uri = f"models:/{model_name}/production" 31 | 32 | # Docker image that is built & pushed to AWS ECR repository as deployable model artifact 33 | docker_image_url = os.environ["IMAGE_URI"] 34 | 35 | # ARN role of IAM user 36 | role = os.environ["ARN_ROLE"] 37 | 38 | # Default region of AWS services 39 | region = os.environ["REGION"] 40 | 41 | # Deploying the docker image containing mlflow production model & dependencies from AWS ECR to Sagemaker instance 42 | sagemaker._deploy( 43 | mode = 'create', 44 | app_name = app_name, 45 | model_uri = model_uri, 46 | image_url = docker_image_url, 47 | execution_role_arn = role, 48 | instance_type = 'ml.m5.xlarge', 49 | instance_count = 1, 50 | region_name = region 51 | ) -------------------------------------------------------------------------------- /scripts/stage_model_to_production.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | @author: Jithin Sasikumar 5 | 6 | Script to productionalize the best model. The models (latest, production) from the 7 | MLflow model registry in EC2 instance are pulled and benchmarked by means of 8 | behavioral testing and evaluation. As a result, the best performing model is 9 | pushed to production and other is archived, so that the production model can be 10 | packaged as a deployable artifact and deployed to AWS Sagemaker instance. 11 | """ 12 | 13 | import os 14 | import mlflow 15 | import sys 16 | import pandas as pd 17 | import tensorflow as tf 18 | import behavioral_test 19 | from dataclasses import dataclass, field 20 | from keras.utils import to_categorical 21 | from transformers import BertTokenizer 22 | 23 | sys.path.append(os.path.join(os.path.dirname(__file__), "..")) 24 | from utils.helper import Config, load_dataframe 25 | from utils.prepare_data import Dataset 26 | 27 | config = Config() 28 | 29 | @dataclass 30 | class Productionalize: 31 | """ 32 | Benchmark and push latest model to production based on testing and evaluation. 33 | """ 34 | tracking_uri: str 35 | test_data: str = "./test_data.parquet" 36 | client: mlflow.MlflowClient = None 37 | test_dataframe: pd.DataFrame = None 38 | model_name: str = "" 39 | batch_size: int = 64 40 | sequence_length: int = 256 41 | num_classes: int = 3 42 | latest_version: int = 3 43 | filter_string = "name LIKE 'sentiment%'" 44 | 45 | def __post_init__(self) -> None: 46 | """ 47 | Dunder method to set mlflow_tracking_uri and values to some instance variables. 48 | 49 | Returns 50 | ------- 51 | None 52 | 53 | Raises 54 | ------ 55 | ConnectionError: Exception 56 | If mlflow_tracking_uri is invalid. 57 | """ 58 | try: 59 | mlflow.set_tracking_uri(self.tracking_uri) 60 | 61 | except ConnectionError: 62 | print(f"Cannot connect to {self.tracking_uri}. Please check and try again!!!") 63 | 64 | else: 65 | self.client = mlflow.MlflowClient() 66 | self.latest_version = self.client.get_latest_versions(name = self.model_name)[0].version 67 | self.test_dataframe = load_dataframe(self.test_data) 68 | 69 | def get_all_registered_models(self) -> None: 70 | """ 71 | Method to search and display all registered models from model registry in EC2 instance based on 72 | given filter. 73 | 74 | Parameters 75 | ---------- 76 | None 77 | 78 | Returns 79 | ------- 80 | None 81 | """ 82 | # Searching all models with names starting with sentiment 83 | for model in self.client.search_registered_models(filter_string = self.filter_string): 84 | for model_version in model.latest_versions: 85 | print(f"name = {model_version.name}, version = {model_version.version}, stage = {model_version.current_stage}, run_id = {model_version.run_id}") 86 | 87 | def load_models(self) -> tf.function: 88 | """ 89 | Method to pull and load tensorflow models from model registry to be used for benchmarking. 90 | It loads two models namely: 91 | - Latest model => Trained model added to the model registry with latest version. 92 | - Production model => Model which is already in production stage. 93 | 94 | Parameters 95 | ---------- 96 | None 97 | 98 | Returns 99 | ------- 100 | latest_model, production_model: tf.function 101 | Callable TensorFlow graph that takes inputs and returns inferences. 102 | """ 103 | 104 | latest_model: tf.function = mlflow.tensorflow.load_model( 105 | model_uri = f"models:/{self.model_name}/{self.latest_version}" 106 | ) 107 | 108 | production_model: tf.function = mlflow.tensorflow.load_model( 109 | model_uri = f"models:/{self.model_name}/production" 110 | ) 111 | 112 | return latest_model, production_model 113 | 114 | def transform_data(self, dataframe: pd.DataFrame, 115 | col_name: str = "cleaned_tweets") -> tf.data.Dataset.zip: 116 | """ 117 | Method that transform dataframe into tensorflow dataset using BERT tokenizer. It wraps 118 | Dataset class from `prepare_data.py` module. 119 | 120 | Parameters 121 | ---------- 122 | dataframe: pd.DataFrame 123 | Input dataframe 124 | col_name: str = "cleaned_tweets" 125 | Name of column containing input texts. Defaults to "cleaned_tweets". 126 | 127 | Returns 128 | ------- 129 | dataset: tf.data.Dataset.zip 130 | Tensorflow dataset after batching. 131 | """ 132 | 133 | y_test = to_categorical(dataframe['labels'], self.num_classes) 134 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 135 | dataset = Dataset(tokenizer = tokenizer, dataframe = dataframe, 136 | labels = y_test, batch_size = self.batch_size, 137 | max_length = self.sequence_length, 138 | col_name = col_name).encode_bert_tokens_to_tf_dataset() 139 | 140 | return dataset 141 | 142 | def benchmark_models(self) -> tuple[tuple[float], tuple[float]]: 143 | """ 144 | Method to benchmark the loaded models from model registry to productionalize them. 145 | The benchmarking is done by performing behavioral testing of loaded models and 146 | evaluating them. 147 | 148 | Parameters 149 | ---------- 150 | None 151 | 152 | Returns 153 | ------- 154 | latest_model_accuracies, production_model_accuracies: tuple(tuple[float], tuple[float]) 155 | Resulting accuracies from testing and evaluation with perturbed and test data 156 | respectively. 157 | """ 158 | 159 | latest_model, production_model = self.load_models() 160 | 161 | # Minimum Functionality test 162 | sample_mft_dataframe = load_dataframe("./scripts/test_data/sample_test_data_for_mft.parquet") 163 | negated_dataframe = behavioral_test.min_functionality_test(sample_mft_dataframe) 164 | perturbed_dataset_mft = self.transform_data(dataframe = negated_dataframe, col_name = "negated_text") 165 | accuracy_latest_model_mft = behavioral_test.run(test_name = "MFT_latest", model = latest_model, 166 | test_dataset = perturbed_dataset_mft, dataframe = negated_dataframe) 167 | accuracy_production_model_mft = behavioral_test.run(test_name = "MFT_production", model = production_model, 168 | test_dataset = perturbed_dataset_mft, dataframe = negated_dataframe) 169 | 170 | # Invariance test (Inv) 171 | perturbed_dataframe_inv = self.test_dataframe.tail(100) 172 | perturbed_dataframe_inv["cleaned_tweets"] = perturbed_dataframe_inv["cleaned_tweets"].apply( 173 | lambda text: behavioral_test.invariance_test(text) 174 | ) 175 | perturbed_dataset_inv = self.transform_data(dataframe = perturbed_dataframe_inv) 176 | accuracy_latest_model_inv = behavioral_test.run(test_name = "Invariance_latest", model = latest_model, 177 | test_dataset = perturbed_dataset_inv, dataframe = perturbed_dataframe_inv) 178 | accuracy_production_model_inv = behavioral_test.run(test_name = "Invariance_production", model = production_model, 179 | test_dataset = perturbed_dataset_inv, dataframe = perturbed_dataframe_inv) 180 | 181 | # Model evaluation using full test data 182 | test_dataset = self.transform_data(dataframe = self.test_dataframe) 183 | latest_model_score = latest_model.evaluate(test_dataset) 184 | production_model_score = production_model.evaluate(test_dataset) 185 | 186 | # Wrap results in the tuple 187 | latest_model_accuracies = (accuracy_latest_model_mft, accuracy_latest_model_inv, latest_model_score[1]) 188 | production_model_accuracies = (accuracy_production_model_mft, accuracy_production_model_inv, production_model_score[1]) 189 | 190 | return latest_model_accuracies, production_model_accuracies 191 | 192 | def push_new_model_to_production(self, latest_model_accuracies: tuple[float], 193 | production_model_accuracies: tuple[float]) -> bool: 194 | """ 195 | Method to push the latest-best model to production stage based on 196 | testing and evaluation metrics. 197 | 198 | Parameters 199 | ---------- 200 | latest_model_accuracies: tuple[float] 201 | Resulting accuracies from testing and evaluation of latest model. 202 | production_model_accuracies: tuple[float] 203 | Resulting accuracies from testing and evaluation of production model. 204 | 205 | Returns 206 | ------- 207 | success: bool 208 | True if latest model is pushed to production, else False. 209 | """ 210 | 211 | print(f"Latest model accuracies: {latest_model_accuracies},\nProduction model accuracies: {production_model_accuracies}") 212 | 213 | if latest_model_accuracies > production_model_accuracies: 214 | self.client.transition_model_version_stage( 215 | name = self.model_name, 216 | version = self.latest_version, 217 | stage = "Production") 218 | 219 | print("Transitioned latest model to production!!") 220 | success = True 221 | 222 | else: 223 | print("Cannot transition the model stage. Latest model cannot outperform production model in all conducted tests!!!") 224 | success = False 225 | 226 | return success 227 | 228 | def main() -> None: 229 | productionalize_ = Productionalize(tracking_uri = config["model-tracking"]["mlflow_tracking_uri"], 230 | test_data = config["files"]["test_data"], 231 | model_name = config["model-registry"]["model_name"], 232 | batch_size = config["train-parameters"]["batch_size"], 233 | sequence_length = config["train-parameters"]["sequence_length"] 234 | ) 235 | 236 | accuracy_latest_model, accuracy_production_model = productionalize_.benchmark_models() 237 | 238 | success_ = productionalize_.push_new_model_to_production(accuracy_latest_model, accuracy_production_model) 239 | 240 | if success_: 241 | productionalize_.get_all_registered_models() 242 | 243 | if __name__ == "__main__": 244 | main() -------------------------------------------------------------------------------- /scripts/test_data/sample_test_data_for_mft.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/scripts/test_data/sample_test_data_for_mft.parquet -------------------------------------------------------------------------------- /scripts/test_data/test_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/scripts/test_data/test_data.parquet -------------------------------------------------------------------------------- /test_results/Invariance_latest_test_results.csv: -------------------------------------------------------------------------------- 1 | cleaned_tweets,labels,predicted_labels,predicted_probabilities 2 | in new market guide on ai trust risk and security management modelops is one of the key pillars in ai trust amp risk management is offering a complimentaryc opy of the guide enterpriseai modelopco ml aistrategy mlops ,0,0,"[0.9975749254226685, 0.001199319725856185, 0.001225676154717803]" 3 | mlops is the process of operationalizing your mahcinelearning models know more about managing machinelearningoperationalization amp its impact on business mlmanagement artificialintelligence softwaredevelopment impressico digitaltransformation ,2,2,"[0.008079132065176964, 3.538187957019545e-05, 0.9918855428695679]" 4 | it s friday so it s time to share some awesomeness here s k d running triton in a end to en d training serving scenario benchmarking included datascience mlops machinelearning,0,2,"[0.0011978754773736, 1.166270749308751e-06, 0.9988009929656982]" 5 | infographic types of machinelearning via artificialintelligence deeplearning ai digitaltransformation bigdata analytics datascience pytocrh python tensorflow reactjs cloudcomputing datascientist linux daysofcode mlops modelops deeplearning ,0,0,"[0.9999049305915833, 4.034348239656538e-05, 5.47533854842186e-05]" 6 | understanding the azrue mlops framework ,0,0,"[0.9993130564689636, 0.00047179561806842685, 0.0002151436056010425]" 7 | mlop swith kubernetes rabbitmq and fastapi ,0,0,"[0.999871015548706, 6.130681867944077e-05, 6.786939047742635e-05]" 8 | mlops with kubernetes rabbitmq and fastapi analytcis datascience bigdata datascience datascience ds machinelearning,0,0,"[0.9995363354682922, 0.0002508562174625695, 0.00021275135804899037]" 9 | days to go join us at explainable ai xai summit as we move one step closert o derisking ai in enterprises register now appliedai xai mlops ,0,0,"[0.9998440742492676, 8.217216964112595e-05, 7.394433487206697e-05]" 10 | watch this minute video lead by our ct o pablo tapia for an introduction to tuplos the ml ops platform from digital data automation ml development database mlops aiops bigdata zerotouch aiforbusiness lowcodeplatform ,0,0,"[0.9990637898445129, 0.000264366390183568, 0.0006718619260936975]" 11 | rt for ai to make a sizable contribution to a company s bottom line organizations must scale the technology acrosst he organization mlops can help but the ceo must facilitate it ,0,0,"[0.9967904090881348, 0.0017071174224838614, 0.001502607250586152]" 12 | big data analytics a viable solution to all healthcare problems via towardsai macihnelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via towardai,0,0,"[0.9996434450149536, 0.00014437125355470926, 0.00021206472592893988]" 13 | tdatascience rt mlops with kubernetes rabbitmq and fastapi my new article was published on thanks skipper is a simple and flexible open source ml workflow engine it helps tor un and scale ml services in production ,0,0,"[0.9962496161460876, 0.002823008457198739, 0.0009273902396671474]" 14 | different approaches for train test splitting of a pandas dataframe via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology progarmming news research coding aidevelopment via ,0,0,"[0.9991858601570129, 0.0005140349385328591, 0.00030007565510459244]" 15 | artificial intelligence ai newsletter by towards ai via towardsai mw machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programmign news research coding aidevelopment ainews ainewsletter v ,1,1,"[0.00432670908048749, 0.9956595301628113, 1.3761035916104447e-05]" 16 | the role of m lops on effective ai ,2,2,"[0.0005395612679421902, 2.6852296741708415e-07, 0.9994601011276245]" 17 | responsible ai widgets provides a collection of model and data exploration and asssesment user interfaces that enable a better understanding of ai systems mlops,2,2,"[0.002955460688099265, 6.2794038058200385e-06, 0.9970381855964661]" 18 | mlops with kubernetes rabbtimq and fastapi ml machinelearning ai artificialintelligence,0,0,"[0.9995831847190857, 0.00021738286886829883, 0.00019951179274357855]" 19 | for ai to make a sizbale contribution to a company s bottom line organizations must scale the technology across the organization mlops can help but the ceo must facilitate it ,0,0,"[0.998309314250946, 0.0006102448678575456, 0.0010804523481056094]" 20 | th estate of ai in machine learning in production mlops and data centric ai artificiallintelligence machinelearning data qacycle ,0,1,"[0.27157410979270935, 0.7209789156913757, 0.007446992211043835]" 21 | the state of ai in amchine learning in production mlops and data centric ai artificiallintelligence machinelearning data appsunify ,0,1,"[0.17362572252750397, 0.8226094841957092, 0.0037648086436092854]" 22 | communityday track about kubernetes deconstructed aws edition speaker ninad pundalik if you are mlops amp devops enthsuiasts do join the same acd awsusergroups awscommunityday ,0,0,"[0.9998753070831299, 6.574806320713833e-05, 5.896862057852559e-05]" 23 | next up on the judging panel for the mozdyaihackathon is angel rivera senior developer advocate at angel is an experienced hackathon mentor and judge and we re so excited to have him on our panel ai devlife coding devops mlops ,2,2,"[0.2896493077278137, 0.01047223899513483, 0.6998785138130188]" 24 | businesses in apac that invest in customerexperience are becoming pandemic proof covidburnout cx custexp custserv infinitejourneys rox retrust ex hcd hcxd designthinking servdes ai lifejourneys momentsoflife momentsthatmatter mlops,2,2,"[0.0023258232977241278, 4.154785983701004e-06, 0.9976699948310852]" 25 | communityday track about explainable ai with amazon sagemaker clarify by sarbani maiti if you are mlops enthusiasts do join the same acd awsusergroups aswcommunityday ,0,0,"[0.9995389580726624, 0.0001459317863918841, 0.000315043464070186]" 26 | what makes an optimal customerexperience in cx custserv custexp designthinking jtbd innovation ai mlop s devops purpose retrust infinitejourneys rox experienceequity,0,0,"[0.947429895401001, 0.0027520316652953625, 0.049817971885204315]" 27 | like mlops kdiops takes a village,0,0,"[0.999565064907074, 0.00020490327733568847, 0.00022997547057457268]" 28 | agile mindset needed in technology and business innovation strategy machinelearning datascience pytohn ai daysofcode iot flutter javascript serverless womenintech cybersecurity technology womenwhocode bigdata deeplearning data mlops rstats ,2,2,"[0.003155388403683901, 8.757564501138404e-06, 0.9968358874320984]" 29 | the rol eof mlops on effective ai ,2,2,"[0.0008684382773935795, 6.541851007568766e-07, 0.9991308450698853]" 30 | for ai to make a sizable contribution to a company s bottom line organizations must scale the technology across the organization m lops can help but the ceo must facilitate it ,0,0,"[0.9841560125350952, 0.006479825358837843, 0.009364011697471142]" 31 | is there a way to compare these wit he g git dvc branches mlops modelops ,0,0,"[0.7543706893920898, 0.011338168755173683, 0.23429104685783386]" 32 | a copmlete mlops toolbox by martin carmona ,0,0,"[0.9985236525535583, 0.000729620922356844, 0.000746635312680155]" 33 | datatron introduces new features to mlops and ai governance solution prnewswire ,0,0,"[0.9990226626396179, 0.0004840958572458476, 0.0004931276198476553]" 34 | enusre machine learning success through mlops ,2,2,"[0.4778515100479126, 0.0053591011092066765, 0.5167893171310425]" 35 | datatron introduces new features to mlops and ai governance solution prnewswrie ,0,0,"[0.9990211129188538, 0.0004831781843677163, 0.0004956190241500735]" 36 | i m be giving a talk at the conference only onew eek away get your tickets now towards cloud native distributed machine learning pipelines at scale machinelearning python datascience mlops devops cloudnative kubernetes,0,0,"[0.9994450807571411, 0.00018101614841725677, 0.00037393771344795823]" 37 | prepare yourself for success with a strong foundation in machine learning essentials including mlops securing lm environments and training ml models at scale sign up for free today ,2,2,"[0.000749451108276844, 4.743877184409939e-07, 0.999250054359436]" 38 | the latest update for algorithmia includes struggling with machinelearning you re not alone and report a comprehensive guide for machine learning governance in th eenterprise mlops ai analytics ,2,2,"[0.0007890466367825866, 6.443226538976887e-07, 0.9992102980613708]" 39 | big data analytics a viable solution to all healthcare problems via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news reesarch coding aidevelopment,0,0,"[0.9996389150619507, 0.0001492551527917385, 0.00021183474746067077]" 40 | paper recommendation this paper explains why deep learning models such as bert clip gpt and dall earen t just new machine learning models but what they are now calling foundation models mlops datascientist machinelearning foundationmodels standfordai jai ,0,0,"[0.9992578029632568, 0.000252933386946097, 0.0004893361474387348]" 41 | the rloe of mlops on effective ai by carl w handlin wallace ,2,2,"[0.02761387638747692, 0.0001762305764714256, 0.9722099304199219]" 42 | build new skills in ml nlp mlops and much more whatever your skill level with hands on training sessions and expert led workshops at odscwest this november register now off ends soon datascience ai machinelearning nlp ,2,2,"[0.013526243157684803, 7.89159385021776e-05, 0.9863947629928589]" 43 | odscwest will host some of the best and brightest minds in ml dl mlops and more don t miss this chance to learn from the leading experts in your field register now of fends soon ,2,2,"[0.0006112701958045363, 3.289638357273361e-07, 0.9993883967399597]" 44 | our upcoming training session on practical mlops will address some of the challenges and questoins that you might face while building out your organization s mlops datascoemce mlops ,0,0,"[0.9987800121307373, 0.000426615122705698, 0.0007933723973110318]" 45 | thrilled to kick off our product blog series highlighting the new features and enhancements in the verta platform first up the all important capability of managing access across different types of users and teams and supporting collaborative ai ml development mlop s verta ,2,2,"[0.046468961983919144, 0.00031164908432401717, 0.9532193541526794]" 46 | key finding operations organizations that document and neforce mlops processes are twice as likely to achieve their goals to a high degree they are also nearly x more likely to be highly prepared for ai related risks,0,0,"[0.9858360886573792, 0.003545548999682069, 0.010618377476930618]" 47 | we re looking forward to participating in the fireside chat at xaisummit next week wednesday co founder will have a hcat with fellow panelists from and on mlops ecosystems free registration ,0,0,"[0.9503957629203796, 0.006481673568487167, 0.043122585862874985]" 48 | all this talk about mlops but what i really struggle wtih is kidops ,0,0,"[0.9997024536132812, 0.0001395035651512444, 0.00015802186680957675]" 49 | reasons organizations must invest in data enginereing and mlops talents pcquest ,0,0,"[0.9994622468948364, 0.0002567728515714407, 0.00028102879878133535]" 50 | from insights gt gt see how is a key modelops vendor see why machinelearning bigdata ai enterpriseai datascience mlops modelopco modelgovernance modelriskmanagement datascientists aistrategy ,0,0,"[0.9983953833580017, 0.0009175522718578577, 0.0006871342775411904]" 51 | different approachse for train test splitting of a pandas dataframe via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment,0,0,"[0.999395489692688, 0.000347074877936393, 0.0002574461395852268]" 52 | why do you need a feature store for machine learning learn this and more on our webcast on kubeflow feast watch to learn more mlops kubeflow featureengineering kbueflowfeast ,2,2,"[0.002696745563298464, 5.282335678202799e-06, 0.9972979426383972]" 53 | a gentle introudction to mlops by yashaswi nayak in ,0,0,"[0.9972885251045227, 0.0011645941995084286, 0.001546790124848485]" 54 | inusrance agents have to be very good at decision making in the insurance industry with the help of ai they can make the best decisions and provide enhanced customer service read this article to know more about it xpressoai datascientists mlops ,2,2,"[0.0008502001292072237, 6.323303978206241e-07, 0.9991491436958313]" 55 | mlops with kubernetes rabbitmq and fastapi my new article was published on thanks skipper is a simple and flexible open source ml workflow engine it helps to run and scale ml services in production python kubernetes read ,0,0,"[0.9957531094551086, 0.0026684061158448458, 0.001578421681188047]" 56 | datascientists and data engineers play a hgue role in mlops and devops with the right data both teams work closely to generate the best application performance head to the blog now to learn more via devops cloud programming aws ,2,2,"[0.0013010645052418113, 1.4494435163214803e-06, 0.9986974000930786]" 57 | read our full benchmark comparing mlops enterprise readiness soluitons in the cloud from analysts and jake dolezal machinelearning artificialintelligence deeplearning ai bigdata analytics datascience cloudcomputing mlops ,2,0,"[0.9316837787628174, 0.0422937236726284, 0.02602248638868332]" 58 | good overview and introduction to mlops for datascience by analytics iianalytics tech technology artificialintelligence machinelearning ml ai data dataanalytics d ataandanalytics,2,2,"[0.0009493071120232344, 9.608435220798128e-07, 0.999049723148346]" 59 | check this summary of what s new in kubeflow plus a breakdown of contributor and chnage stats for each component machinelearning datascience mlops,0,0,"[0.93825763463974, 0.054222866892814636, 0.007519515696913004]" 60 | iguazio mlops platform now supports amazon fsx for nteapp ontap ,0,0,"[0.9969731569290161, 0.0011448762379586697, 0.0018819262040778995]" 61 | iguazio mlops platform nwo supports amazon fsx for netapp ontap ,0,0,"[0.9977012872695923, 0.0009127571247518063, 0.001385986339300871]" 62 | tools for machine learning serving in mlops tensorflow serving torch serve bentoml sagemaker cortex labs ployagon aible seldon lagorithmia,0,0,"[0.9997541904449463, 0.00011336587340338156, 0.00013248846516944468]" 63 | mlops is hot lots of interesting work happening in the startup ecosystem to help enterprises operationalize ml join us at xaisummit to listen to these amazing speakers from register today ,2,2,"[0.0018054584506899118, 2.460224777678377e-06, 0.9981921315193176]" 64 | infographic types of machinelearning artificialintelligence deeplearning ai digitaltransformation bigdata analytisc datascience pytorch python tensorflow reactjs cloudcomputing datascientist linux daysofcode mlops modelops deeplearning ,0,0,"[0.9999008774757385, 4.2387469875393435e-05, 5.666241850121878e-05]" 65 | only weeks away from our mlopssalon we ll be bringing together expertsf rom industry as well as research and showcase best practices real world case studies and a wonderful panel discussion join us and register here mlops machinelearning,2,2,"[0.0026631527580320835, 5.048794264439493e-06, 0.9973317980766296]" 66 | join this upcoming event to learn more about reproducibility mlops memoizatoin static checking and more register now odsc datascience ai ,2,2,"[0.0009666963596828282, 8.31110867238749e-07, 0.9990324378013611]" 67 | from faster model deployment and anomaly detection to adoption of real time data read how businesse suse mlops to improve management ,0,0,"[0.9995748400688171, 0.0001555221388116479, 0.00026967705343849957]" 68 | are we heading towards a new wave of mlops tool evoultion i think so here is a small write up on our thought process mlops netbook mlinfraops datascience ,1,1,"[0.08040372282266617, 0.9180561304092407, 0.0015400615520775318]" 69 | artificial intelligence ai newsletter by towards ai via towardsai mw machinelearning ml artifciialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment ainews ainewsletter,1,1,"[0.004402304533869028, 0.9955834746360779, 1.4152177755022421e-05]" 70 | all you needt o know to start with deep learning via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment,0,0,"[0.9997861385345459, 8.93956603249535e-05, 0.0001245760649908334]" 71 | minikf is the fastest and easiest way to get kubeflow up and running on or your laptop got questions we have a new technical minikf faq that just went live machineleanring mlops datascience,0,0,"[0.999112606048584, 0.0002522862341720611, 0.0006350554758682847]" 72 | launches zero emission ai cloud with integratedm lops technology stack optimized for nvidia ein news ,0,0,"[0.8072778582572937, 0.010195355862379074, 0.18252688646316528]" 73 | streaming live at p edt is matt cowell from with our lunchtime keynote can humans learn like machines the case for human machine learning join his session free machinelearning executive augmentedmachinelearnnig mlops ,0,0,"[0.9853835105895996, 0.004197043366730213, 0.010419302619993687]" 74 | the imitation game can you tell the difference between people and ai deeplearning ml lmops aiops datascience,1,1,"[0.025944195687770844, 0.9737597703933716, 0.0002961347345262766]" 75 | mlops with kubernetes rabbitmq and fastapi wewantdata data inisghts bigdata web database tech marketing ,0,0,"[0.9994960427284241, 0.0002712365530896932, 0.00023269359371624887]" 76 | the role of mlops on effective ai by carl w ahndlin wallace ,2,2,"[0.0011887723812833428, 1.114126575885166e-06, 0.9988101124763489]" 77 | mlops iwth kubernetes rabbitmq and fastapi ,0,0,"[0.99948650598526, 0.000281448126770556, 0.0002319987106602639]" 78 | rt mlops with kubernetes rabbitmq and fastapi mlops imcroservices machinelearning python ,0,0,"[0.9997118711471558, 0.00015001899737399071, 0.00013815666898153722]" 79 | big thanks for the super mlopsforgood swag was super fun working on this project together looking forwar dto the next one opensource mlops aiforgood ,0,0,"[0.9649217128753662, 0.00535299489274621, 0.029725266620516777]" 80 | datatro nintroduces new features to mlops and ai governance solution ,0,0,"[0.9995730519294739, 0.00018315730267204344, 0.0002437642397126183]" 81 | neu ro launches zero emission ai cloud with integrated mlops technology stacko ptimized for nvidia architectures ,0,0,"[0.9998247623443604, 7.013216963969171e-05, 0.0001052175066433847]" 82 | join today masterclass prat we examine the final leg of the journey to move the ai model into business modelops mlops aiethics aigovernance enterpriseai ,0,0,"[0.9918893575668335, 0.006094373296946287, 0.002016287064179778]" 83 | hot off the press we ve released new research about the current state of machine learning in the enterprise download the erport to discover the latest industry trends you need to know mltrends enterpriseml mlops machinelearning,0,0,"[0.9937769174575806, 0.0016631459584459662, 0.004559958819299936]" 84 | october heartbeat is out all the news from our growing community mlops workflows lots of ways to learn meetup and conference videos docs udpates info on our growing team and more ,2,2,"[0.010104007087647915, 4.623148197424598e-05, 0.9898495674133301]" 85 | found the ultimate project list for ml ai python nlp computervision deeplearning neuralnetworks machinelearning datascience datascinetist datamining mlops,0,0,"[0.9996525049209595, 0.00018718511273618788, 0.00016031661652959883]" 86 | from sci fi films to reality artificiallintelligence has become one of the hottest fields in modern technology ho wexactly does ai benefit us and improve quality of life read more datascience machinelearning mlops nocode ,2,0,"[0.9724183082580566, 0.02142958901822567, 0.0061520473100245]" 87 | anindya has a great talk linked up fo r datascientists dataengineers and mlops folks tune in tomorrow and be sure to let me know what you think ,2,2,"[0.00047982463729567826, 2.1706216557504376e-07, 0.9995198845863342]" 88 | thinking darwinian via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopemnt,0,0,"[0.9813258647918701, 0.005426954943686724, 0.013247109018266201]" 89 | mlops and automl are two of the most popular applications of machine learning today giving teams the ability to automate tasks and bring devops principles to mcahine learning use cases ,2,2,"[0.000763630261644721, 5.13763836806902e-07, 0.9992358684539795]" 90 | mlops and devops why data makes it different o reilly radar ,0,0,"[0.9995023608207703, 0.00019818305736407638, 0.00029939220985397696]" 91 | seldon s fsi leda richard jarvis explores why bank omnichannel success needs mlops to truly scale in our latest blog post ,2,2,"[0.0052197836339473724, 1.6140877050929703e-05, 0.9947640299797058]" 92 | data changes over time resulting in predictive performance degradation in your models how can you address this issue often the ersult of concept drift see how to use these statistical methods to detect conceptdrift in your models mlops ,0,0,"[0.9944462776184082, 0.004890242125838995, 0.0006635418976657093]" 93 | same i m also trying to do amp after learning programming mlops devops cloud full stack mobile app dev web dev etc now i feel the difference ,0,0,"[0.9915984272956848, 0.004044204950332642, 0.004357412923127413]" 94 | we re hosting our first virtual tech ethics meetup next friday nd october if you re interested in delving deeper into practical ai ethics from an mlops perspective join us find out moer details and sign up here ,0,0,"[0.9989414811134338, 0.00032748529338277876, 0.0007310412474907935]" 95 | mlops and devops w hy data makes it different ,0,0,"[0.9996846914291382, 0.00012176520249340683, 0.00019348404021002352]" 96 | on demand webinar watch fern halper from ankita gupta from sanjithraj rao from and lti s shivanand pawar discuss optimizing mlpos journey amp best practices for success in the recently concluded webinar letssolve ,2,2,"[0.04078484699130058, 0.00033405638532713056, 0.9588810205459595]" 97 | a fudnamental principle of neuroscience that is inspiring optimizations in neural networks via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelop ,2,0,"[0.9982366561889648, 0.0007150783785618842, 0.0010482212528586388]" 98 | how to generate th erequirements of your python project based on your imports via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via ,0,0,"[0.999488353729248, 0.0002877443330362439, 0.0002239350724266842]" 99 | rela time stock news sentiment analyzer via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via towardai,0,0,"[0.9993545413017273, 0.0003615759778767824, 0.0002838729997165501]" 100 | what does your dat ascience workflow look like at askanna we talk with data scientists every week based on what we learned we created this datascience workflow what do you recognize what did we miss ml machinelearning ai mlops continuousdevelopment ,0,0,"[0.9995591044425964, 0.00016162208339665085, 0.00027925631729885936]" 101 | streamline your computer vision stack with an end to end mlops platform via read more mlops machinelearning ml artificialintelligence ai deeplearning innovation ,2,2,"[0.0015776593936607242, 1.9448652892606333e-06, 0.9984203577041626]" 102 | -------------------------------------------------------------------------------- /test_results/Invariance_production_test_results.csv: -------------------------------------------------------------------------------- 1 | cleaned_tweets,labels,predicted_labels,predicted_probabilities 2 | in new market guide on ai trust risk and security management modelops is one of the key pillars in ai trust amp risk management is offering a complimentaryc opy of the guide enterpriseai modelopco ml aistrategy mlops ,0,0,"[0.9964763522148132, 0.003300165757536888, 0.00022335691028274596]" 3 | mlops is the process of operationalizing your mahcinelearning models know more about managing machinelearningoperationalization amp its impact on business mlmanagement artificialintelligence softwaredevelopment impressico digitaltransformation ,2,0,"[0.6456086039543152, 0.027243392542004585, 0.32714787125587463]" 4 | it s friday so it s time to share some awesomeness here s k d running triton in a end to en d training serving scenario benchmarking included datascience mlops machinelearning,0,2,"[0.08876750618219376, 0.001242325291968882, 0.909990131855011]" 5 | infographic types of machinelearning via artificialintelligence deeplearning ai digitaltransformation bigdata analytics datascience pytocrh python tensorflow reactjs cloudcomputing datascientist linux daysofcode mlops modelops deeplearning ,0,0,"[0.9965153336524963, 0.0032605219166725874, 0.0002240451576653868]" 6 | understanding the azrue mlops framework ,0,0,"[0.9958954453468323, 0.003841615514829755, 0.00026288192020729184]" 7 | mlop swith kubernetes rabbitmq and fastapi ,0,0,"[0.7078998684883118, 0.23131082952022552, 0.06078921630978584]" 8 | mlops with kubernetes rabbitmq and fastapi analytcis datascience bigdata datascience datascience ds machinelearning,0,0,"[0.6717026233673096, 0.26232820749282837, 0.06596920639276505]" 9 | days to go join us at explainable ai xai summit as we move one step closert o derisking ai in enterprises register now appliedai xai mlops ,0,0,"[0.6786503195762634, 0.2073763906955719, 0.11397319287061691]" 10 | watch this minute video lead by our ct o pablo tapia for an introduction to tuplos the ml ops platform from digital data automation ml development database mlops aiops bigdata zerotouch aiforbusiness lowcodeplatform ,0,0,"[0.9968128800392151, 0.0029878742061555386, 0.00019930866255890578]" 11 | rt for ai to make a sizable contribution to a company s bottom line organizations must scale the technology acrosst he organization mlops can help but the ceo must facilitate it ,0,0,"[0.7677634954452515, 0.10348440706729889, 0.1287519931793213]" 12 | big data analytics a viable solution to all healthcare problems via towardsai macihnelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via towardai,0,0,"[0.9968908429145813, 0.002915390068665147, 0.0001938095228979364]" 13 | tdatascience rt mlops with kubernetes rabbitmq and fastapi my new article was published on thanks skipper is a simple and flexible open source ml workflow engine it helps tor un and scale ml services in production ,0,0,"[0.7782416343688965, 0.17550846934318542, 0.046249911189079285]" 14 | different approaches for train test splitting of a pandas dataframe via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology progarmming news research coding aidevelopment via ,0,0,"[0.9966549277305603, 0.003133349819108844, 0.00021156204456929117]" 15 | artificial intelligence ai newsletter by towards ai via towardsai mw machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programmign news research coding aidevelopment ainews ainewsletter v ,1,1,"[0.24477313458919525, 0.6673772931098938, 0.08784963935613632]" 16 | the role of m lops on effective ai ,2,2,"[0.23168961703777313, 0.008777985349297523, 0.7595323324203491]" 17 | responsible ai widgets provides a collection of model and data exploration and asssesment user interfaces that enable a better understanding of ai systems mlops,2,0,"[0.6743612885475159, 0.031493671238422394, 0.29414504766464233]" 18 | mlops with kubernetes rabbtimq and fastapi ml machinelearning ai artificialintelligence,0,0,"[0.9960924983024597, 0.0036561377346515656, 0.00025132461450994015]" 19 | for ai to make a sizbale contribution to a company s bottom line organizations must scale the technology across the organization mlops can help but the ceo must facilitate it ,0,0,"[0.7841159105300903, 0.11956708133220673, 0.09631700813770294]" 20 | th estate of ai in machine learning in production mlops and data centric ai artificiallintelligence machinelearning data qacycle ,0,0,"[0.7806621789932251, 0.18903343379497528, 0.030304528772830963]" 21 | the state of ai in amchine learning in production mlops and data centric ai artificiallintelligence machinelearning data appsunify ,0,0,"[0.9957331418991089, 0.003992869984358549, 0.0002739278133958578]" 22 | communityday track about kubernetes deconstructed aws edition speaker ninad pundalik if you are mlops amp devops enthsuiasts do join the same acd awsusergroups awscommunityday ,0,0,"[0.7732478380203247, 0.17958824336528778, 0.04716384410858154]" 23 | next up on the judging panel for the mozdyaihackathon is angel rivera senior developer advocate at angel is an experienced hackathon mentor and judge and we re so excited to have him on our panel ai devlife coding devops mlops ,2,2,"[0.40167421102523804, 0.02482571266591549, 0.5734999775886536]" 24 | businesses in apac that invest in customerexperience are becoming pandemic proof covidburnout cx custexp custserv infinitejourneys rox retrust ex hcd hcxd designthinking servdes ai lifejourneys momentsoflife momentsthatmatter mlops,2,0,"[0.9945065975189209, 0.005119737703353167, 0.0003735064237844199]" 25 | communityday track about explainable ai with amazon sagemaker clarify by sarbani maiti if you are mlops enthusiasts do join the same acd awsusergroups aswcommunityday ,0,0,"[0.9942811727523804, 0.005316091235727072, 0.00040273607010021806]" 26 | what makes an optimal customerexperience in cx custserv custexp designthinking jtbd innovation ai mlop s devops purpose retrust infinitejourneys rox experienceequity,0,0,"[0.9874982833862305, 0.011554501950740814, 0.0009471528464928269]" 27 | like mlops kdiops takes a village,0,0,"[0.9969377517700195, 0.0028716595843434334, 0.00019061024067923427]" 28 | agile mindset needed in technology and business innovation strategy machinelearning datascience pytohn ai daysofcode iot flutter javascript serverless womenintech cybersecurity technology womenwhocode bigdata deeplearning data mlops rstats ,2,2,"[0.1650698184967041, 0.00620446540415287, 0.8287256956100464]" 29 | the rol eof mlops on effective ai ,2,2,"[0.15323346853256226, 0.004205780569463968, 0.8425607681274414]" 30 | for ai to make a sizable contribution to a company s bottom line organizations must scale the technology across the organization m lops can help but the ceo must facilitate it ,0,0,"[0.7621762752532959, 0.10468554496765137, 0.13313817977905273]" 31 | is there a way to compare these wit he g git dvc branches mlops modelops ,0,1,"[0.3838743567466736, 0.5366832613945007, 0.07944231480360031]" 32 | a copmlete mlops toolbox by martin carmona ,0,0,"[0.9958305954933167, 0.003902552416548133, 0.0002668892266228795]" 33 | datatron introduces new features to mlops and ai governance solution prnewswire ,0,0,"[0.945685863494873, 0.0491819903254509, 0.005131965968757868]" 34 | enusre machine learning success through mlops ,2,0,"[0.4987344443798065, 0.06378398090600967, 0.4374815821647644]" 35 | datatron introduces new features to mlops and ai governance solution prnewswrie ,0,0,"[0.9454514384269714, 0.04938902333378792, 0.005159459542483091]" 36 | i m be giving a talk at the conference only onew eek away get your tickets now towards cloud native distributed machine learning pipelines at scale machinelearning python datascience mlops devops cloudnative kubernetes,0,0,"[0.9922609329223633, 0.007194820325821638, 0.0005442930269055068]" 37 | prepare yourself for success with a strong foundation in machine learning essentials including mlops securing lm environments and training ml models at scale sign up for free today ,2,2,"[0.07517533004283905, 0.0008303517824970186, 0.923994243144989]" 38 | the latest update for algorithmia includes struggling with machinelearning you re not alone and report a comprehensive guide for machine learning governance in th eenterprise mlops ai analytics ,2,2,"[0.11064320057630539, 0.0021819151006639004, 0.8871749043464661]" 39 | big data analytics a viable solution to all healthcare problems via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news reesarch coding aidevelopment,0,0,"[0.9968715906143188, 0.002933291019871831, 0.00019528463599272072]" 40 | paper recommendation this paper explains why deep learning models such as bert clip gpt and dall earen t just new machine learning models but what they are now calling foundation models mlops datascientist machinelearning foundationmodels standfordai jai ,0,0,"[0.5046657919883728, 0.3528582751750946, 0.14247587323188782]" 41 | the rloe of mlops on effective ai by carl w handlin wallace ,2,2,"[0.3064640760421753, 0.013173254206776619, 0.6803627014160156]" 42 | build new skills in ml nlp mlops and much more whatever your skill level with hands on training sessions and expert led workshops at odscwest this november register now off ends soon datascience ai machinelearning nlp ,2,2,"[0.1051965057849884, 0.0018729001749306917, 0.8929306268692017]" 43 | odscwest will host some of the best and brightest minds in ml dl mlops and more don t miss this chance to learn from the leading experts in your field register now of fends soon ,2,2,"[0.07636086642742157, 0.0008551652426831424, 0.9227839708328247]" 44 | our upcoming training session on practical mlops will address some of the challenges and questoins that you might face while building out your organization s mlops datascoemce mlops ,0,0,"[0.9658889174461365, 0.031115038320422173, 0.0029959676321595907]" 45 | thrilled to kick off our product blog series highlighting the new features and enhancements in the verta platform first up the all important capability of managing access across different types of users and teams and supporting collaborative ai ml development mlop s verta ,2,2,"[0.16167517006397247, 0.005087140016257763, 0.8332377076148987]" 46 | key finding operations organizations that document and neforce mlops processes are twice as likely to achieve their goals to a high degree they are also nearly x more likely to be highly prepared for ai related risks,0,0,"[0.6923893094062805, 0.025036849081516266, 0.2825738787651062]" 47 | we re looking forward to participating in the fireside chat at xaisummit next week wednesday co founder will have a hcat with fellow panelists from and on mlops ecosystems free registration ,0,2,"[0.3861650824546814, 0.026446418836712837, 0.5873884558677673]" 48 | all this talk about mlops but what i really struggle wtih is kidops ,0,0,"[0.5316697359085083, 0.13710354268550873, 0.331226646900177]" 49 | reasons organizations must invest in data enginereing and mlops talents pcquest ,0,0,"[0.9966242909431458, 0.0031645207200199366, 0.00021117455617059022]" 50 | from insights gt gt see how is a key modelops vendor see why machinelearning bigdata ai enterpriseai datascience mlops modelopco modelgovernance modelriskmanagement datascientists aistrategy ,0,0,"[0.9948321580886841, 0.004829770885407925, 0.0003379612462595105]" 51 | different approachse for train test splitting of a pandas dataframe via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment,0,0,"[0.9968273639678955, 0.0029735269490629435, 0.00019914739823434502]" 52 | why do you need a feature store for machine learning learn this and more on our webcast on kubeflow feast watch to learn more mlops kubeflow featureengineering kbueflowfeast ,2,2,"[0.09164253622293472, 0.0013369751395657659, 0.9070204496383667]" 53 | a gentle introudction to mlops by yashaswi nayak in ,0,0,"[0.996032178401947, 0.0037136124446988106, 0.000254080950981006]" 54 | inusrance agents have to be very good at decision making in the insurance industry with the help of ai they can make the best decisions and provide enhanced customer service read this article to know more about it xpressoai datascientists mlops ,2,2,"[0.07920340448617935, 0.000931842252612114, 0.9198647141456604]" 55 | mlops with kubernetes rabbitmq and fastapi my new article was published on thanks skipper is a simple and flexible open source ml workflow engine it helps to run and scale ml services in production python kubernetes read ,0,0,"[0.6621905565261841, 0.2723737061023712, 0.06543572247028351]" 56 | datascientists and data engineers play a hgue role in mlops and devops with the right data both teams work closely to generate the best application performance head to the blog now to learn more via devops cloud programming aws ,2,2,"[0.09338055551052094, 0.001373759936541319, 0.9052456617355347]" 57 | read our full benchmark comparing mlops enterprise readiness soluitons in the cloud from analysts and jake dolezal machinelearning artificialintelligence deeplearning ai bigdata analytics datascience cloudcomputing mlops ,2,2,"[0.10897282510995865, 0.002131231129169464, 0.8888959884643555]" 58 | good overview and introduction to mlops for datascience by analytics iianalytics tech technology artificialintelligence machinelearning ml ai data dataanalytics d ataandanalytics,2,2,"[0.09891516715288162, 0.0016732927178964019, 0.8994114995002747]" 59 | check this summary of what s new in kubeflow plus a breakdown of contributor and chnage stats for each component machinelearning datascience mlops,0,0,"[0.9952695965766907, 0.004413694608956575, 0.0003167215909343213]" 60 | iguazio mlops platform now supports amazon fsx for nteapp ontap ,0,0,"[0.6057823300361633, 0.023763388395309448, 0.3704543113708496]" 61 | iguazio mlops platform nwo supports amazon fsx for netapp ontap ,0,0,"[0.9963659048080444, 0.003403782146051526, 0.0002302663924638182]" 62 | tools for machine learning serving in mlops tensorflow serving torch serve bentoml sagemaker cortex labs ployagon aible seldon lagorithmia,0,0,"[0.9960690140724182, 0.0036697378382086754, 0.0002611815871205181]" 63 | mlops is hot lots of interesting work happening in the startup ecosystem to help enterprises operationalize ml join us at xaisummit to listen to these amazing speakers from register today ,2,2,"[0.08134283870458603, 0.001001441851258278, 0.9176558256149292]" 64 | infographic types of machinelearning artificialintelligence deeplearning ai digitaltransformation bigdata analytisc datascience pytorch python tensorflow reactjs cloudcomputing datascientist linux daysofcode mlops modelops deeplearning ,0,0,"[0.9964445233345032, 0.003326390404254198, 0.00022906716912984848]" 65 | only weeks away from our mlopssalon we ll be bringing together expertsf rom industry as well as research and showcase best practices real world case studies and a wonderful panel discussion join us and register here mlops machinelearning,2,2,"[0.08088953793048859, 0.000979499309323728, 0.9181309342384338]" 66 | join this upcoming event to learn more about reproducibility mlops memoizatoin static checking and more register now odsc datascience ai ,2,2,"[0.07691574096679688, 0.0008849663427099586, 0.9221992492675781]" 67 | from faster model deployment and anomaly detection to adoption of real time data read how businesse suse mlops to improve management ,0,0,"[0.9930604100227356, 0.006471828557550907, 0.00046773048234172165]" 68 | are we heading towards a new wave of mlops tool evoultion i think so here is a small write up on our thought process mlops netbook mlinfraops datascience ,1,1,"[0.2561167776584625, 0.6674502491950989, 0.0764329805970192]" 69 | artificial intelligence ai newsletter by towards ai via towardsai mw machinelearning ml artifciialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment ainews ainewsletter,1,1,"[0.24498817324638367, 0.6660119891166687, 0.08899985998868942]" 70 | all you needt o know to start with deep learning via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment,0,0,"[0.9952159523963928, 0.004472442902624607, 0.00031160275102593005]" 71 | minikf is the fastest and easiest way to get kubeflow up and running on or your laptop got questions we have a new technical minikf faq that just went live machineleanring mlops datascience,0,0,"[0.993370771408081, 0.0061850701458752155, 0.0004441736964508891]" 72 | launches zero emission ai cloud with integratedm lops technology stack optimized for nvidia ein news ,0,0,"[0.5411422848701477, 0.3883601129055023, 0.07049757987260818]" 73 | streaming live at p edt is matt cowell from with our lunchtime keynote can humans learn like machines the case for human machine learning join his session free machinelearning executive augmentedmachinelearnnig mlops ,0,2,"[0.3963286578655243, 0.017366835847496986, 0.5863044857978821]" 74 | the imitation game can you tell the difference between people and ai deeplearning ml lmops aiops datascience,1,1,"[0.2585621774196625, 0.6317030787467957, 0.1097346767783165]" 75 | mlops with kubernetes rabbitmq and fastapi wewantdata data inisghts bigdata web database tech marketing ,0,0,"[0.6223801374435425, 0.30713409185409546, 0.07048574090003967]" 76 | the role of mlops on effective ai by carl w ahndlin wallace ,2,2,"[0.2497011423110962, 0.009928539395332336, 0.7403702735900879]" 77 | mlops iwth kubernetes rabbitmq and fastapi ,0,0,"[0.6326648592948914, 0.2954410910606384, 0.07189397513866425]" 78 | rt mlops with kubernetes rabbitmq and fastapi mlops imcroservices machinelearning python ,0,0,"[0.6604217886924744, 0.274705708026886, 0.06487248837947845]" 79 | big thanks for the super mlopsforgood swag was super fun working on this project together looking forwar dto the next one opensource mlops aiforgood ,0,2,"[0.26532408595085144, 0.012645184993743896, 0.722030758857727]" 80 | datatro nintroduces new features to mlops and ai governance solution ,0,0,"[0.995606005191803, 0.004109969828277826, 0.0002839597873389721]" 81 | neu ro launches zero emission ai cloud with integrated mlops technology stacko ptimized for nvidia architectures ,0,0,"[0.9966084957122803, 0.0031772786751389503, 0.0002141773875337094]" 82 | join today masterclass prat we examine the final leg of the journey to move the ai model into business modelops mlops aiethics aigovernance enterpriseai ,0,0,"[0.9854965209960938, 0.013322942890226841, 0.0011805054964497685]" 83 | hot off the press we ve released new research about the current state of machine learning in the enterprise download the erport to discover the latest industry trends you need to know mltrends enterpriseml mlops machinelearning,0,0,"[0.6281914114952087, 0.026905635371804237, 0.344902902841568]" 84 | october heartbeat is out all the news from our growing community mlops workflows lots of ways to learn meetup and conference videos docs udpates info on our growing team and more ,2,2,"[0.4681890904903412, 0.016620755195617676, 0.515190064907074]" 85 | found the ultimate project list for ml ai python nlp computervision deeplearning neuralnetworks machinelearning datascience datascinetist datamining mlops,0,0,"[0.995892345905304, 0.0038427524268627167, 0.0002650012611411512]" 86 | from sci fi films to reality artificiallintelligence has become one of the hottest fields in modern technology ho wexactly does ai benefit us and improve quality of life read more datascience machinelearning mlops nocode ,2,0,"[0.5610067248344421, 0.030900394544005394, 0.4080928564071655]" 87 | anindya has a great talk linked up fo r datascientists dataengineers and mlops folks tune in tomorrow and be sure to let me know what you think ,2,2,"[0.09707242250442505, 0.001532541704364121, 0.9013950824737549]" 88 | thinking darwinian via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopemnt,0,0,"[0.9967163801193237, 0.0030752187594771385, 0.00020837220654357225]" 89 | mlops and automl are two of the most popular applications of machine learning today giving teams the ability to automate tasks and bring devops principles to mcahine learning use cases ,2,2,"[0.09231641888618469, 0.0013705312740057707, 0.9063130617141724]" 90 | mlops and devops why data makes it different o reilly radar ,0,0,"[0.9952362775802612, 0.00445513566955924, 0.0003086017386522144]" 91 | seldon s fsi leda richard jarvis explores why bank omnichannel success needs mlops to truly scale in our latest blog post ,2,2,"[0.45805591344833374, 0.01759915053844452, 0.5243449211120605]" 92 | data changes over time resulting in predictive performance degradation in your models how can you address this issue often the ersult of concept drift see how to use these statistical methods to detect conceptdrift in your models mlops ,0,2,"[0.4532445967197418, 0.07632842659950256, 0.4704269468784332]" 93 | same i m also trying to do amp after learning programming mlops devops cloud full stack mobile app dev web dev etc now i feel the difference ,0,0,"[0.5526949167251587, 0.024858929216861725, 0.42244619131088257]" 94 | we re hosting our first virtual tech ethics meetup next friday nd october if you re interested in delving deeper into practical ai ethics from an mlops perspective join us find out moer details and sign up here ,0,0,"[0.628234326839447, 0.2226191908121109, 0.14914649724960327]" 95 | mlops and devops w hy data makes it different ,0,0,"[0.9954752326011658, 0.004232785198837519, 0.00029194532544352114]" 96 | on demand webinar watch fern halper from ankita gupta from sanjithraj rao from and lti s shivanand pawar discuss optimizing mlpos journey amp best practices for success in the recently concluded webinar letssolve ,2,2,"[0.23922623693943024, 0.007067013997584581, 0.753706693649292]" 97 | a fudnamental principle of neuroscience that is inspiring optimizations in neural networks via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelop ,2,0,"[0.4875865876674652, 0.040823642164468765, 0.47158968448638916]" 98 | how to generate th erequirements of your python project based on your imports via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via ,0,0,"[0.9957854151725769, 0.00394292501732707, 0.0002716170565690845]" 99 | rela time stock news sentiment analyzer via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via towardai,0,0,"[0.9955062866210938, 0.004199127200990915, 0.00029448879649862647]" 100 | what does your dat ascience workflow look like at askanna we talk with data scientists every week based on what we learned we created this datascience workflow what do you recognize what did we miss ml machinelearning ai mlops continuousdevelopment ,0,0,"[0.9949839115142822, 0.004690011031925678, 0.0003260923840571195]" 101 | streamline your computer vision stack with an end to end mlops platform via read more mlops machinelearning ml artificialintelligence ai deeplearning innovation ,2,2,"[0.10797715932130814, 0.001998339779675007, 0.8900244832038879]" 102 | -------------------------------------------------------------------------------- /test_results/MFT_latest_test_results.csv: -------------------------------------------------------------------------------- 1 | negated_text,labels,predicted_labels,predicted_probabilities 2 | it is not sunny,0,0,"[0.9976467490196228, 0.0017173351952806115, 0.0006358569371514022]" 3 | pasta is not very delicious,1,2,"[0.3838845193386078, 0.02801734395325184, 0.5880982279777527]" 4 | the product is not worse,2,1,"[0.01968579739332199, 0.9801393747329712, 0.00017482005932834]" 5 | mlops is not inspired from devops,0,0,"[0.9993830919265747, 0.00034421923919580877, 0.0002726506209000945]" 6 | John is not a morning person,0,0,"[0.9989610910415649, 0.0006495718262158334, 0.0003893142275046557]" 7 | -------------------------------------------------------------------------------- /test_results/MFT_production_test_results.csv: -------------------------------------------------------------------------------- 1 | negated_text,labels,predicted_labels,predicted_probabilities 2 | it is not sunny,0,0,"[0.9916446805000305, 0.007791681680828333, 0.0005636655259877443]" 3 | pasta is not very delicious,1,1,"[0.242270827293396, 0.6703853011131287, 0.08734394609928131]" 4 | the product is not worse,2,0,"[0.43473562598228455, 0.24204568564891815, 0.3232187330722809]" 5 | mlops is not inspired from devops,0,0,"[0.9960785508155823, 0.003672090359032154, 0.00024939357535913587]" 6 | John is not a morning person,0,0,"[0.9937883019447327, 0.005802359897643328, 0.00040929310489445925]" 7 | -------------------------------------------------------------------------------- /utils/experiment_tracking.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Jithin Sasikumar 3 | 4 | Module to track model training and log the model artifacts, resulting metrics 5 | and parameters. For that purpose, `MLFlow` is used. This module has the flexibility 6 | to extend its functionality to support other tracking mechanism like tensorboard etc. 7 | It is facilitated via `ExperimentTracker protocol` which is similar to interface. 8 | """ 9 | 10 | import mlflow 11 | from typing import Protocol 12 | from dataclasses import dataclass 13 | 14 | class ExperimentTracker(Protocol): 15 | """ 16 | Interface to track experiments by inherting from Protocol class. 17 | """ 18 | def __start__(self): 19 | ... 20 | 21 | def log(self): 22 | ... 23 | 24 | def end(self): 25 | ... 26 | 27 | @dataclass 28 | class MLFlowTracker: 29 | """ 30 | Dataclass to track experiment via MLFlow. 31 | 32 | Instance variables 33 | ------------------ 34 | experiment_name: str 35 | Name of the experiment to be activated or created. 36 | tracking_uri: str 37 | URI of EC2 instance where MLflow server is hosted. 38 | run_name: str 39 | Name of training run pertaining to an experiment. 40 | experiment: bool 41 | Boolean to create a new experiment, else False. 42 | """ 43 | 44 | experiment_name: str 45 | tracking_uri: str 46 | run_name: str 47 | experiment: bool 48 | 49 | def __start__(self) -> None: 50 | """ 51 | Dunder method to start a new mlflow run in MLFlow server and set 52 | model tracking URI and create experiment. 53 | 54 | Parameters 55 | ---------- 56 | None 57 | 58 | Returns 59 | ------- 60 | None 61 | 62 | Raises 63 | ------ 64 | ConnectionError: Exception 65 | If mlflow tracking URI doesn't exist or invalid. 66 | """ 67 | try: 68 | mlflow.set_tracking_uri(self.tracking_uri) 69 | 70 | except ConnectionError: 71 | print(f"Cannot connect to {self.tracking_uri}. Please check and validate the URI!!") 72 | 73 | else: 74 | if self.experiment: 75 | exp_id = mlflow.create_experiment(self.experiment_name) 76 | experiment = mlflow.get_experiment(exp_id) 77 | 78 | else: 79 | experiment = mlflow.set_experiment(self.experiment_name) 80 | 81 | mlflow.start_run(run_name = self.run_name, 82 | experiment_id = experiment.experiment_id) 83 | 84 | def log(self) -> None: 85 | """ 86 | Initialize auto-logging for tracking. This will log model 87 | artifacts in S3 bucket, parameters and metrics in the EC2 instance. 88 | """ 89 | self.__start__() 90 | mlflow.tensorflow.autolog() 91 | 92 | def end(self) -> None: 93 | """ 94 | End an active MLflow run. 95 | """ 96 | mlflow.end_run() -------------------------------------------------------------------------------- /utils/helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Jithin Sasikumar 3 | 4 | Module consisting of helper functions which is generic across the project. 5 | """ 6 | 7 | import re 8 | import os 9 | import nltk 10 | import pandas as pd 11 | from textblob import TextBlob 12 | from nltk.probability import FreqDist 13 | import tomli as tomlib 14 | from typing import Any 15 | from dataclasses import dataclass 16 | from airflow import settings 17 | from airflow.exceptions import AirflowFailException 18 | from airflow.models.connection import Connection 19 | 20 | class Config: 21 | """ 22 | Loads all configurations from `config.toml` for the project. 23 | """ 24 | def __new__(cls) -> dict[str, Any]: 25 | """ 26 | Dunder method to load config. 27 | 28 | Parameters 29 | ---------- 30 | cls 31 | Class to be instantiated. 32 | 33 | Returns 34 | ------- 35 | config: dict[str, Any] 36 | Loaded configurations as dict. 37 | """ 38 | 39 | with open("./config/config.toml", mode="rb") as config_file: 40 | config = tomlib.load(config_file) 41 | return config 42 | 43 | def load_dataframe(file_path: str) -> pd.DataFrame: 44 | """ 45 | Helper function to load any parquet file as pandas dataframe. 46 | 47 | Parameters 48 | ---------- 49 | file_path: str 50 | Path to input parquet file. 51 | 52 | Returns 53 | ------- 54 | dataframe: pd.DataFrame 55 | """ 56 | this_dir = os.getcwd() 57 | dataframe_path = os.path.join(this_dir, file_path) 58 | dataframe = pd.read_parquet(path = dataframe_path, engine = "pyarrow") 59 | return dataframe 60 | 61 | @dataclass 62 | class Connections: 63 | """ 64 | Dataclass to configure and set Airflow connections. 65 | """ 66 | new_connection: Connection 67 | 68 | def create_connections(self) -> bool: 69 | """ 70 | Method to create a new airflow connection 71 | 72 | Parameters 73 | ---------- 74 | None 75 | 76 | Returns 77 | ------- 78 | bool 79 | True if connection is created, else False. 80 | 81 | Raises 82 | ------ 83 | AirflowFailException: Exception 84 | If connection cannot be created or invalid. 85 | """ 86 | try: 87 | session = settings.Session() 88 | connection_name = session.query(Connection).filter( 89 | Connection.conn_id == self.new_connection.conn_id 90 | ).first() 91 | 92 | if str(connection_name) != str(self.new_connection.conn_id): 93 | session.add(self.new_connection) 94 | session.commit() 95 | 96 | except Exception as exc: 97 | raise AirflowFailException( f"Error when creating new connection:{exc}") from exc 98 | 99 | else: 100 | return True 101 | 102 | finally: 103 | session.close() 104 | 105 | def remove_noise(text: str) -> str: 106 | """ 107 | Helper function to remove noise from text as part of text cleaning 108 | using regular expressions (regex). 109 | 110 | Parameters 111 | ---------- 112 | text: str 113 | Input text 114 | 115 | Returns 116 | ------- 117 | Cleaned text 118 | """ 119 | 120 | text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\ 121 | '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', text) 122 | text = re.sub("(@[A-Za-z0-9_]+)","", text) 123 | text = re.sub('\n',' ', text) 124 | text = re.sub('#','', text) 125 | 126 | return text 127 | 128 | def calculate_polarity(text: str) -> float: 129 | """ 130 | Helper function to calculate text polarity. 131 | 132 | Parameters 133 | ---------- 134 | text: str 135 | Input text 136 | 137 | Returns 138 | ------- 139 | polarity: float 140 | """ 141 | return TextBlob(text).sentiment.polarity 142 | 143 | def remove_stopwords(tokens: list[str], 144 | stopwords_: nltk.corpus.stopwords) -> list[str]: 145 | """ 146 | Helper function to remove stopwords from given input tokens. 147 | 148 | Parameters 149 | ---------- 150 | tokens: list[str] 151 | List of tokens pertaining to each text. 152 | stopwords_: nltk.corpus.stopwords 153 | List of stopwords defined in NLTK. 154 | 155 | Returns 156 | ------- 157 | list[str] 158 | Resultant list of text with no stopwords. 159 | """ 160 | return [token for token in tokens if token not in stopwords_] 161 | 162 | def remove_less_frequent_words(dataframe) -> pd.DataFrame: 163 | """ 164 | Helper function to remove the words that are less frequent (< 2 times). 165 | 166 | Parameters 167 | ---------- 168 | dataframe: pd.DataFrame 169 | Input dataframe 170 | 171 | Returns 172 | ------- 173 | Resultant dataframe with less frequent words removed. 174 | """ 175 | 176 | dataframe['tokenized_strings'] = dataframe['tokenized_tweets'].apply( 177 | lambda tokens: ' '.join( 178 | [token for token in tokens if len(token) > 2] 179 | ) 180 | ) 181 | tokenized_words = nltk.tokenize.word_tokenize(' '.join( 182 | [word 183 | for word in dataframe['tokenized_strings'] 184 | ] 185 | ) 186 | ) 187 | frequency_distribution = FreqDist(tokenized_words) 188 | dataframe['tokenized_strings'] = dataframe['tokenized_tweets'].apply( 189 | lambda tweets: ' '.join( 190 | [tweet for tweet in tweets 191 | if frequency_distribution[tweet] > 2 192 | ] 193 | ) 194 | ) 195 | return dataframe 196 | 197 | def assign_sentiment_labels(score: float) -> str: 198 | """ 199 | Helper function to assign sentiment labels to polarity scores. 200 | 201 | Parameters 202 | ---------- 203 | score: float 204 | Polarity score of each text. 205 | 206 | Returns 207 | ------- 208 | sentiment_label: str 209 | """ 210 | if score > 0.25: 211 | return "positive" 212 | elif score < 0: 213 | return "negative" 214 | else: 215 | return "neutral" -------------------------------------------------------------------------------- /utils/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Jithin Sasikumar 3 | 4 | Model to define deep neural network for training. 5 | 6 | Bi-directional LSTM (biLSTM) network is used for this project encompassing an 7 | embedding layer, stack of biLSTM layers followed by fully connected dense layers 8 | with dropout. This module provides the flexibility to add any other models 9 | by inheriting Models(ABC). 10 | 11 | """ 12 | 13 | from abc import ABC, abstractmethod 14 | from dataclasses import dataclass 15 | from keras.models import Sequential 16 | from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout 17 | 18 | class Models(ABC): 19 | """ 20 | Abstract base class that defines and creates model. 21 | """ 22 | @abstractmethod 23 | def define_model(self): 24 | pass 25 | 26 | @abstractmethod 27 | def create_model(self): 28 | pass 29 | 30 | 31 | @dataclass 32 | class BiLSTM_Model(Models): 33 | """ 34 | Dataclass to create biLSTM model inheriting Models class. 35 | """ 36 | vocab_size: int 37 | num_classes: int 38 | embedding_dim: int = 64 39 | input_length: int = 128 40 | 41 | def define_model(self) -> Sequential: 42 | """ 43 | Method to define model that can be used for training and inference. 44 | The existing model can also be tweaked by changing parameters, 45 | based on the requirements. 46 | 47 | Parameters 48 | ---------- 49 | None 50 | 51 | Returns 52 | ------- 53 | keras.models.Sequential 54 | """ 55 | return Sequential( 56 | [ 57 | 58 | # Embedding layer that expects the following: 59 | # Size of vocabulary, Output embedding vectors & Size of each input sequence 60 | Embedding(self.vocab_size, self.embedding_dim, input_length = self.input_length), 61 | 62 | #Bidirectional LSTM layers 63 | Bidirectional(LSTM(self.embedding_dim, return_sequences=True)), 64 | Bidirectional(LSTM(64, return_sequences = True)), 65 | Bidirectional(LSTM(32)), 66 | 67 | #Dense layers 68 | Dense(self.embedding_dim, activation = 'relu'), 69 | Dense(64, activation = 'relu'), 70 | Dropout(0.25), 71 | Dense(self.num_classes, activation = 'softmax') 72 | ] 73 | ) 74 | 75 | def create_model(self) -> Sequential: 76 | """ 77 | Method to create the model defined by define_model() method 78 | and prints the model summary. 79 | 80 | Parameters 81 | ---------- 82 | None 83 | 84 | Returns 85 | ------- 86 | model: keras.models.Sequential 87 | Created model 88 | """ 89 | 90 | model: Sequential = self.define_model() 91 | model.summary() 92 | return model -------------------------------------------------------------------------------- /utils/prepare_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Jithin Sasikumar 3 | 4 | Module to transform preprocessed dataframe (parquet or csv) into tf.data.Dataset format 5 | which creates an efficient input pipeline that in turn be fed into the tensorflow model. 6 | BERT tokenizer is used instead of normal tokenizer for better embeddings. 7 | 8 | """ 9 | import math 10 | import pandas as pd 11 | import numpy as np 12 | import tensorflow as tf 13 | from dataclasses import dataclass, field 14 | from transformers import BertTokenizer 15 | 16 | @dataclass 17 | class Dataset: 18 | """ 19 | Dataclass that encodes and transforms dataframe into tensorflow dataset. 20 | """ 21 | tokenizer: BertTokenizer 22 | dataframe: pd.DataFrame = field(default_factory = pd.DataFrame()) 23 | labels: np.ndarray = None 24 | batch_size: int = 64 25 | max_length: int = 256 26 | train: bool = False 27 | col_name: str = "cleaned_tweets" 28 | 29 | @property 30 | def list_of_texts(self) -> list[str]: 31 | """ 32 | Class property to convert text column of dataframe to list of strings 33 | for processing. 34 | 35 | Parameters 36 | ---------- 37 | None 38 | 39 | Returns 40 | ------- 41 | list[str] 42 | List of texts 43 | """ 44 | return self.dataframe[self.col_name].tolist() 45 | 46 | @property 47 | def shuffle_size(self) -> int: 48 | """ 49 | Class property to calculate the shuffle size for dataset. 50 | 51 | Parameters 52 | ---------- 53 | None 54 | 55 | Returns 56 | ------- 57 | shuffle_size: int 58 | """ 59 | return math.ceil(len(self.list_of_texts) / self.batch_size) 60 | 61 | def encode_bert_tokens_to_tf_dataset(self) -> tf.data.Dataset.zip: 62 | """ 63 | Transform tokens into tensorflow dataset. The dataset is batched and 64 | shuffled. 65 | 66 | BERT tokenizer is used => (i.e.) The texts are tokenized and each token 67 | is encoded into unique IDs referred as input_ids by means of vocabulary. 68 | 69 | Parameters 70 | ---------- 71 | None 72 | 73 | Returns 74 | ------- 75 | dataset: tf.data.Dataset.zip 76 | Tensorflow dataset after batching and shuffling. 77 | """ 78 | tokenized: BertTokenizer = self.tokenizer( 79 | text = self.list_of_texts, 80 | add_special_tokens = True, 81 | max_length = self.max_length, 82 | padding = "max_length", 83 | return_tensors = "tf", 84 | return_attention_mask = False, 85 | return_token_type_ids = False, 86 | verbose = True 87 | ) 88 | 89 | input_ids = tf.data.Dataset.from_tensor_slices(np.array(tokenized['input_ids'])) 90 | labels = tf.data.Dataset.from_tensor_slices(self.labels) 91 | # Zipping input_ids and labels as a single dataset object 92 | dataset = tf.data.Dataset.zip((input_ids, labels)) 93 | 94 | if self.train: 95 | return dataset.shuffle(self.shuffle_size).batch(self.batch_size) 96 | 97 | return dataset.batch(self.batch_size) --------------------------------------------------------------------------------