├── .github
│   └── workflows
│       ├── benchmark_and_test_model.yml
│       ├── deploy.yml
│       └── run_dags.yml
├── .gitignore
├── Makefile
├── README.md
├── config
│   └── config.toml
├── dags
│   ├── etl_twitter_dag.py
│   ├── model_training_dag.py
│   └── task_definitions
│       ├── etl_task_definitions.py
│       └── model_training.py
├── dependencies
│   ├── Dockerfile
│   └── requirements.txt
├── docker-compose.yaml
├── images
│   ├── Sagemaker_endpoint.jpg
│   ├── architecture_diagram.jpeg
│   ├── ecr_image.PNG
│   ├── etl_dag.PNG
│   ├── mlflow_exps.PNG
│   ├── model_dag.PNG
│   ├── model_plot.png
│   ├── model_registry_latest1.PNG
│   ├── model_registry_latest2.PNG
│   └── model_registry_org.PNG
├── scripts
│   ├── behavioral_test.py
│   ├── deploy.py
│   ├── stage_model_to_production.py
│   └── test_data
│       ├── sample_test_data_for_mft.parquet
│       └── test_data.parquet
├── test_results
│   ├── Invariance_latest_test_results.csv
│   ├── Invariance_production_test_results.csv
│   ├── MFT_latest_test_results.csv
│   └── MFT_production_test_results.csv
└── utils
    ├── experiment_tracking.py
    ├── helper.py
    ├── model.py
    └── prepare_data.py
/.github/workflows/benchmark_and_test_model.yml:
--------------------------------------------------------------------------------
1 | # Name of the workflow
2 | name: Test and benchmark models
3 |
4 | on:
5 |   push:
6 |     branches: [main]
7 |
8 | jobs:
9 |   build:
10 |     runs-on: self-hosted
11 |     steps:
12 |       - uses: actions/checkout@v2
13 |
14 |       - name: Test and benchmark models
15 |         id: test_benchmark
16 |         env:
17 |           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
18 |           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
19 |           REGION: ${{ secrets.REGION }}
20 |
21 |         run: |
22 |           python3 -m pip install --upgrade pip
23 |           pip install -r ./dependencies/requirements.txt
24 |           python -m spacy download en_core_web_sm
25 |           python ./scripts/stage_model_to_production.py
26 |
27 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | # Name of the workflow
2 | name: Deploy to sagemaker
3 |
4 | on: workflow_dispatch
5 |
6 | jobs:
7 |   build:
8 |     runs-on: self-hosted
9 |     steps:
10 |       - uses: actions/checkout@v2
11 |
12 |       - name: Deploy production-ready image from AWS ECR to Sagemaker
13 |         id: deploy_to_prod
14 |         env:
15 |           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
16 |           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
17 |           REGION: ${{ secrets.REGION }}
18 |           IMAGE_URI: ${{ secrets.IMAGE_URI }}
19 |           ARN_ROLE: ${{ secrets.ARN_ROLE }}
20 |
21 |         run: |
22 |           python3 -m pip install --upgrade pip
23 |           pip install -r ./dependencies/requirements.txt
24 |           python ./scripts/deploy.py
--------------------------------------------------------------------------------
/.github/workflows/run_dags.yml:
--------------------------------------------------------------------------------
1 | # Name of the workflow
2 | name: Run Airflow DAG
3 |
4 | on: workflow_dispatch
5 |
6 | jobs:
7 |   build:
8 |     runs-on: self-hosted
9 |     steps:
10 |       - uses: actions/checkout@v2
11 |
12 |       - name: Run airflow dag
13 |         # Expose the secrets as environment variables for the Makefile / DAG run
14 |         env:
15 |           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
16 |           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
17 |           REGION: ${{ secrets.REGION }}
18 |           LOGIN: ${{ secrets.LOGIN }}
19 |           PASSWORD: ${{ secrets.PASSWORD }}
20 |           HOST: ${{ secrets.HOST }}
21 |           ACCOUNT: ${{ secrets.ACCOUNT }}
22 |           WAREHOUSE: ${{ secrets.WAREHOUSE }}
23 |           DATABASE: ${{ secrets.DATABASE }}
24 |           SCHEMA: ${{ secrets.SCHEMA }}
25 |         run: make run_dag
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile to run Airflow DAG in docker container with external dependencies
2 |
3 | include .env
4 |
5 | run_dag:
6 | # Build extended airflow docker image with required pip dependencies
7 | docker build . -f ./dependencies/Dockerfile --tag extending_airflow:latest
8 | # Rebuild airflow webserver and scheduler with our newly built image
9 | docker-compose up -d --no-deps --build airflow-webserver airflow-scheduler
10 |
11 | # Start all required containers to run all airflow services
12 | docker-compose -f docker-compose.yaml up -d
13 | docker ps
14 | sleep 15
15 |
16 | # Triggering DAG for the first time by accessing the webserver container
17 | docker exec -it twitter_bot_airflow-webserver_1 bash -c "airflow dags trigger twitter_data_pipeline_dag_etl"
18 |
19 | stop_dag:
20 | docker-compose -f docker-compose.yaml down
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sentiment analysis from MLOps paradigm
2 |
3 | 
4 | 
5 |
6 | This project implements an **automated end-to-end ML pipeline** that trains a **bi-directional LSTM** network for sentiment analysis, **tracks** experiments, **pushes** trained models to a **model registry**, **benchmarks** them by means of **model testing** and **evaluation**, pushes the best model into production, **dockerizes** the production model artifacts into a deployable image and **deploys** it to a cloud instance via **CI/CD**.
7 |
8 | ## Author
9 |
10 | - [@Jithin Sasikumar](https://www.github.com/Jithsaavvy)
11 |
12 | ## Languages and Tools
13 |
14 |
15 | <!-- Badge icons for the languages and tools used in this project -->
27 |
28 |
29 | ## Motivation
30 |
31 | A machine learning (ML) project comprises a chain of tasks such as data collection, pre-processing, dataset transformation, feature extraction, model training, model selection, evaluation and deployment. For a small-scale project, these tasks can be managed manually but, as the scale and scope of the project increase, the manual process becomes a real pain. The actual problem arises when the model has to be productionalized in order to create value out of it. MLOps defines various disciplines to nullify such problems and work efficiently. Thus, pipelines are crucial in an ML project, and automating such end-to-end pipelines is equally vital.
32 |
33 | ## Description
34 |
35 | The project is a concoction of research (sentiment analysis, NLP, BERT, biLSTM), development (text normalization, ETL, transformation, deep neural network training, evaluation, model testing) and deployment (building and packaging model artifacts, tracking, docker, workflows, pipelines, cloud), tied together by CI/CD pipelines with automated releases.
36 |
37 | |  |
38 | |:--:|
39 | | Figure 1: Complete end-to-end project pipeline|
40 |
41 | ## Technical facets
42 |
43 | 1. Setting up `Airflow` in docker for `workflow orchestration`.
44 | 2. Writing a `Dockerfile` that builds a base docker image with all dependencies installed and with secrets containing sensitive credentials and access tokens mounted.
45 | 3. Defining the `ETL` and `model training` workflows and scheduling them for orchestration.
46 | 4. Executing `Airflow DAGs`:
47 |    - **ETL** - Performs the Extract, Transform and Load operation on twitter data. As a result, raw tweets scraped from twitter are processed and loaded into the `Snowflake data warehouse` as a database table.
48 |    - **Model_training** - A deep end-to-end `biLSTM` model is trained using `Tensorflow` on the processed data fetched from the data warehouse.
49 | 5. Tracking the entire model training using an `MLflow server` hosted on an `AWS EC2 instance`, to which trained model artifacts, metrics and parameters are logged.
50 | 6. Using `AWS S3 buckets` to store the model artifacts and data.
51 | 7. Adding the trained model to the `MLflow model registry` on the `AWS EC2 instance`, which facilitates managing, maintaining, versioning, staging, testing and productionalizing the model collaboratively.
52 | 8. Automating the `pipeline` as follows:
53 |    - Initialize `GitHub Actions` workflows.
54 |    - `benchmark_and_test_model.yml` => In order to productionalize a model, simply evaluating it is not sufficient; it is also very important to test it. Thus, the best model is pushed into the **production stage** by means of **benchmarking** (`behavioral testing` + evaluation).
55 |    - `deploy.yml` => The production model from the model registry on the `EC2 instance` is packaged into a docker image with all required dependencies & metadata as a `deployable model artifact` and pushed into `Amazon ECR` **(CI job)**. The deployable image is then deployed to an `AWS Sagemaker` instance, which creates an **endpoint** that can be used to communicate with the model for inference **(CD job)**.
56 |    - `run_dags.yml` => Triggers the Airflow DAG runs that perform the ETL and model training tasks on schedule.
57 |    - `release.yml` => A new release is created automatically when tags are pushed to the repository.
58 |
59 |
60 | ## Directory structure
61 |
62 | ```
63 | ├── .github
64 | │   └── workflows
65 | │       ├── benchmark_and_test_model.yml
66 | │       ├── deploy.yml
67 | │       ├── release.yml
68 | │       └── run_dags.yml
69 | ├── config
70 | │   └── config.toml
71 | ├── dags                        # Directory where every Airflow DAG is defined
72 | │   ├── etl_twitter_dag.py
73 | │   ├── model_training_dag.py
74 | │   └── task_definitions
75 | │       ├── etl_task_definitions.py
76 | │       └── model_training.py
77 | ├── dependencies
78 | │   ├── Dockerfile
79 | │   └── requirements.txt
80 | ├── docker-compose.yaml         # Airflow and its components run as docker containers
81 | ├── images
82 | ├── Makefile                    # Set of docker commands for the Airflow run
83 | ├── README.md
84 | ├── scripts                     # Code for model testing, evaluation and deployment to AWS Sagemaker
85 | │   ├── behavioral_test.py
86 | │   ├── deploy.py
87 | │   ├── stage_model_to_production.py
88 | │   └── test_data
89 | │       ├── sample_test_data_for_mft.parquet
90 | │       └── test_data.parquet
91 | ├── test_results
92 | └── utils
93 |     ├── experiment_tracking.py
94 |     ├── helper.py
95 |     ├── model.py
96 |     └── prepare_data.py
97 |
98 | ```
99 |
100 | ## Pipeline
101 | ### Dependencies & Secrets management
102 |
103 | As mentioned above, Airflow runs in a docker container. In order to install dependencies, a docker image is built with all the dependencies installed, and it is used as the base image for `docker-compose`. The dependencies are listed in [requirements.txt](./dependencies/requirements.txt). A better way would be to use a dependency management tool such as **Poetry** for organizational projects, but that is out of scope for this project.
104 |
105 | One important challenge is to manage & handle sensitive information such as **credentials and access tokens** needed for Airflow to connect with other services like `AWS S3`, `Snowflake` and `EC2`. It is vulnerable to use any such sensitive info during `docker build`, as it would be exposed as a result of layer caching during the image build. The secure way is to mount them into the image as **docker secrets** and then export them as environment variables, so that they aren't leaked. It can be done as follows:
106 |
107 | - Create secrets using the command
108 | ```
109 | docker secret create <secret_name> <path_to_secret_file>
110 | ```
111 |
112 | - Mount those secrets into `/run/secrets/` of the container
113 | ```
114 | RUN --mount=type=secret,id=<secret_id> \
115 |     export <ENV_VAR_NAME>=$(cat /run/secrets/<secret_id>)
116 | ```
117 |
118 | #### ProTip to do the same in production environment
119 |
120 | The aforementioned steps are not well suited for production. In a production environment, use `docker stack` instead. For more info, refer [here](https://docs.docker.com/engine/swarm/stack-deploy/)
121 |
122 | ### Workflow Orchestration - Airflow
123 |
124 | [Apache Airflow](https://airflow.apache.org/) is used to orchestrate the workflows in this project. The workflows are represented as **Directed Acyclic Graphs** `(DAGs)`.
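As an illustration, a DAG can be declared with Airflow's TaskFlow API, the same `@dag`/`@task` decorators used by the DAGs in [./dags](./dags). The sketch below is a minimal, hypothetical example; the DAG id, task names and schedule are illustrative.

```
from datetime import datetime
from airflow.decorators import dag, task

@dag(dag_id="example_pipeline", start_date=datetime(2023, 1, 1),
     schedule_interval="@monthly", catchup=False)
def example_pipeline():
    @task
    def extract() -> str:
        return "raw data"

    @task
    def transform(raw: str) -> str:
        return raw.upper()

    # Task dependencies follow from the data flow: extract >> transform
    transform(extract())

example_dag = example_pipeline()
```
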
125 |
126 | ### DAGS
127 |
128 | ### ETL
129 | It is a data workflow that performs the Extract, Transform, Load `(ETL)` task defined in [etl_twitter_dag.py](./dags/etl_twitter_dag.py) on a scheduled interval. It performs the following tasks:
130 | - The raw tweets are scraped from twitter using the [snscrape](https://pypi.org/project/snscrape/) library and loaded into an `AWS S3 bucket`.
131 | - They are cleaned using **regular expressions** and labelled by calculating their **polarity** (a sketch of this labelling step follows below), then loaded into the same `S3 bucket`.
132 | - The labelled data is normalized and preprocessed using NLP techniques and loaded as a **database table** into the `Snowflake data warehouse`, where it can be used for analysis and model training.
133 | - The data is stored in the `parquet` format for efficient storage and retrieval.
134 |
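The polarity calculation and label assignment are implemented in `utils/helper.py`, which is not included in this listing; the snippet below is only a rough sketch of how such labelling could look, assuming `TextBlob` (which is in [requirements.txt](./dependencies/requirements.txt)). The function names and thresholds are illustrative, while the label names match those used in the ETL task.

```
from textblob import TextBlob

def calculate_polarity(text: str) -> float:
    # Polarity ranges from -1.0 (most negative) to +1.0 (most positive)
    return TextBlob(text).sentiment.polarity

def assign_sentiment_label(polarity: float) -> str:
    # Thresholds are illustrative; labels mirror the neutral/negative/positive scheme of this project
    if polarity > 0:
        return "positive"
    if polarity < 0:
        return "negative"
    return "neutral"

assign_sentiment_label(calculate_polarity("This product is very good"))  # -> "positive"
```
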
135 | |  |
136 | |:--:|
137 | | Figure 2: ETL Data pipeline - Airflow|
138 |
139 | ### Model training
140 | It is a model training workflow that trains a deep end-to-end `biLSTM` network with a `BERT tokenizer`. A detailed explanation of the biLSTM model can be found [here](#bi-directional-lstm-model). The DAG performs the following tasks:
141 |
142 | - Preprocessed data loaded as a result of ETL pipeline is fetched from the database
143 | table of **snowflake data warehouse** as a **dataframe**.
144 | - An external (user-built) **docker container** with `tensorflow GPU` and other dependencies installed is used to train the model. This is facilitated in Airflow by the `DockerOperator`:
145 | ```
146 | DockerOperator(
147 | task_id = "train_model_task",
148 | image = "model_training_tf:latest",
149 | auto_remove = True,
150 | docker_url = "unix://var/run/docker.sock",
151 | api_version = "auto",
152 | command = "python3 model_training.py"
153 | )
154 | ```
155 | - The **GPU accelerated** training for the above task is defined in [model_training.py](./dags/task_definitions/model_training.py). Additionally, a `BERT tokenizer` is used instead of a normal tokenizer, **(i.e.)** the texts are tokenized and each token is encoded into a unique ID referred to as `input_ids` (a short sketch follows below). Finally, they are transformed into `tensorflow datasets` for an efficient input pipeline and fed into the model. All of this is defined in [prepare_data.py](./utils/prepare_data.py).
156 |
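The actual encoding logic lives in [prepare_data.py](./utils/prepare_data.py) (`Dataset.encode_bert_tokens_to_tf_dataset`), which is not reproduced here; the snippet below is a simplified sketch of the idea, with the batch size and sequence length taken from [config.toml](./config/config.toml) and the example texts purely illustrative.

```
import tensorflow as tf
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

texts = ["this product is very good", "this product is not very good"]
labels = [2, 1]  # label encoding used in the ETL task: neutral=0, negative=1, positive=2

# Each text is tokenized and encoded into fixed-length input_ids
encodings = tokenizer(texts, padding="max_length", truncation=True,
                      max_length=512, return_tensors="tf")

# Wrap input_ids and labels into a tf.data.Dataset for an efficient input pipeline
dataset = (tf.data.Dataset
           .from_tensor_slices((encodings["input_ids"], labels))
           .batch(128))
```
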
157 | |  |
158 | |:--:|
159 | | Figure 3: Model training pipeline - Airflow|
160 |
161 | **Note:**
162 | *GPU used for training*: NVIDIA GeForce GTX 980M with `8GB GDDR5` memory
163 |
164 | ### Bi-directional LSTM model
165 | A biLSTM network encompassing an
166 | `embedding layer`, a stack of `biLSTM layers` followed by `fully connected dense layers`
167 | with `dropout` is used for this project. The **model plot** is depicted in the image below:
168 |
169 |
170 |
171 |
172 |
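The actual architecture is defined in [model.py](./utils/model.py) (`BiLSTM_Model`), which is not shown in this listing. The sketch below is only a hypothetical reconstruction of the layers described above; the LSTM units and dropout rate are assumptions, while the embedding dimension, sequence length and number of classes follow [config.toml](./config/config.toml).

```
from keras import layers
from keras.models import Sequential

def create_bilstm_model(vocab_size: int, num_classes: int = 3,
                        embedding_dim: int = 128, sequence_length: int = 512) -> Sequential:
    # Embedding -> stacked bidirectional LSTMs -> dense layer with dropout -> softmax output
    return Sequential([
        layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length),
        layers.Bidirectional(layers.LSTM(64, return_sequences=True)),  # units are assumptions
        layers.Bidirectional(layers.LSTM(32)),
        layers.Dense(64, activation="relu"),
        layers.Dropout(0.3),                                           # dropout rate is an assumption
        layers.Dense(num_classes, activation="softmax"),
    ])
```
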
173 | ### MLflow Server
174 |
175 | All the experiments are tracked and logged by [MLflow](https://mlflow.org/docs/latest/tracking.html). This is not done locally on a **localhost**; instead, the `MLflow Server` is installed and hosted on an `AWS EC2 instance` as a **remote tracking server**, which paves the way for centralized access. The **trained model artifacts** are saved in an `AWS S3 bucket` that serves as the artifact store, while parameters, per-epoch metrics and all other metadata are logged on the EC2 instance itself.
176 |
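The tracking calls in this project are wrapped by `MLFlowTracker` in [experiment_tracking.py](./utils/experiment_tracking.py), which is not shown here. Conceptually, logging against the remote server boils down to something like the sketch below (URI and names from config.toml; the metric value is purely illustrative).

```
import mlflow

# Point the client at the remote tracking server hosted on EC2
mlflow.set_tracking_uri("http://ec2-44-203-120-100.compute-1.amazonaws.com:5000/")
mlflow.set_experiment("sentiment_classifier")

with mlflow.start_run(run_name="sc_run3"):
    mlflow.log_param("batch_size", 128)
    mlflow.log_metric("accuracy", 0.87, step=1)  # illustrative value
    # Artifacts logged in this run land in the S3 bucket configured as the server's artifact store
```
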
177 | |  |
178 | |:--:|
179 | | Figure 4: All experiment runs on MLflow Server - EC2 Instance|
180 |
181 | ### MLflow Model Registry
182 |
183 | The models to be staged and tested are pushed to the model registry, which serves as a **centralized model store**. It makes it possible to manage, version, stage, test and productionalize the model and provides functionality to work on the models collaboratively.
184 |
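The registry interactions in this project are implemented in [stage_model_to_production.py](./scripts/stage_model_to_production.py) and [experiment_tracking.py](./utils/experiment_tracking.py); the snippet below is only a reduced sketch of how a version can be looked up and staged with the MLflow client, using the model name and filter string from [config.toml](./config/config.toml).

```
from mlflow.tracking import MlflowClient

client = MlflowClient(tracking_uri="http://ec2-44-203-120-100.compute-1.amazonaws.com:5000/")

# Find every registered version of the sentiment model (filter string from config.toml)
versions = client.search_model_versions("name LIKE 'sentiment%'")
latest = max(versions, key=lambda version: int(version.version))

# Move the newest version into the Staging stage before it is benchmarked
client.transition_model_version_stage(name="sentiment_classifier",
                                      version=latest.version,
                                      stage="Staging")
```
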
185 | |  |
186 | |:--:|
187 | | Figure 5: Model Registry with already existing production model and staged model - EC2 Instance |
188 |
189 | ### Benchmarking
190 |
191 | The latest model version and the model currently in the production stage are benchmarked by means of behavioral testing and evaluation. This is done to find out whether the latest model outperforms the current production model. If yes, it triggers the `CI/CD` workflow job.
192 |
193 | Model testing differs from model evaluation. For instance, a model with a high evaluation metric is not guaranteed to be the best-performing model, because it might still fail in specific scenarios. To capture and quantify that, **model testing** is an important aspect in production.
194 |
195 | ### Behavioral testing
196 |
197 | It is based on this [paper](https://homes.cs.washington.edu/~marcotcr/acl20_checklist.pdf), which tests the behavior of a model under specific conditions. The [Checklist](https://github.com/marcotcr/checklist) library is used for performing both tests. These testing functions are defined in [behavioral_test.py](./scripts/behavioral_test.py). Three different types of tests are proposed in the paper, but only two of them are performed in this project, namely:
198 | - Minimum Functionality test (MFT)
199 | - Invariance test (INV)
200 |
201 | ### MFT:
202 | MFT is inspired by unit tests. A specific behavior (or capability) of the model is tested.
203 |
204 | | 1. | **Model** | Sentiment Analysis |
205 | |----|:-------------------------:|:------------------------------------------------------------------------------------------------------------------:|
206 | | 2. | **Dataset** | Perturbed dataset created from a small subset of the test dataset with labels. The original texts are negated to create the perturbations |
207 | | 3. | **Minimum functionality** | Negations (i.e.) how well the model handles negated inputs |
208 | | 4. | **Example** | *Original text*: This product is very good - **Positive** <br> *Negated text*: This product is not very good - **Negative** |
209 | | 5. | **Expected behavior** | Model should be generalized to predict correct labels for both original and negated text |
210 |
211 | ### INV
212 |
213 | Label-preserving perturbations are applied to the test data. Despite perturbing the data, the model is expected to give the same prediction.
214 |
215 | | 1. | **Model** | Sentiment Analysis |
216 | |:---:|:-------------------------:|:-----------------------------------------------------------------------------------------------------------------------------:|
217 | | 2. | **Dataset** | Larger subset of test dataset is perturbed by adding invariances and their contexts are preserved |
218 | | 3. | **Invariance** | Typos and expanding contractions (i.e.) how well the model handles these invariances |
219 | | 4. | **Example** | *Original text*: I haven't liked this product - **Negative** <br> *Invariance text*: I have not liekd this prodcut - **Negative** |
220 | | 5. | **Expected behavior** | Model should be generalized to handle these invariances and predict the same label for both original and invariance texts |
221 |
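Both tests are implemented in [behavioral_test.py](./scripts/behavioral_test.py), which also scores the model on the perturbed data. A reduced sketch of how the perturbations themselves can be generated with `checklist` is shown below; the `spaCy` model is the one downloaded in `benchmark_and_test_model.yml`, and the example texts are illustrative.

```
import spacy
from checklist.perturb import Perturb

nlp = spacy.load("en_core_web_sm")
texts = ["This product is very good", "I haven't liked this product"]

# MFT data: negate the original sentences (add_negation works on spaCy-parsed docs)
negated = Perturb.perturb([nlp(text) for text in texts], Perturb.add_negation)

# INV data: label-preserving perturbations such as random typos
with_typos = Perturb.perturb(texts, Perturb.add_typos)

print(negated.data, with_typos.data)
```
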
222 | Benchmarking (defined in [stage_model_to_production.py](./scripts/stage_model_to_production.py)) is done as follows:
223 | - Latest and current production models are pulled from the model registry.
224 | - Test data (fresh data that the model hasn't seen during training) is fetched from S3 bucket.
225 | - **Behavioral testing** (perturbed data) and **evaluation** (original test data) is performed for both the models and metrics are returned.
226 | - If the latest model outperforms the current production model, the latest model is pushed into production and the current production model is archived.
227 |
228 | ```
229 | productionalize_ = Productionalize(tracking_uri = config["model-tracking"]["mlflow_tracking_uri"],
230 | test_data = config["files"]["test_data"],
231 | model_name = config["model-registry"]["model_name"],
232 | batch_size = config["train-parameters"]["batch_size"],
233 | sequence_length = config["train-parameters"]["sequence_length"]
234 | )
235 |
236 | accuracy_latest_model, accuracy_production_model = productionalize_.benchmark_models()
237 |
238 | success_ = productionalize_.push_new_model_to_production(accuracy_latest_model, accuracy_production_model)
239 | ```
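For reference, both candidates can also be pulled straight from the registry by stage or by version with `mlflow.pyfunc`; a minimal sketch (the version number is illustrative):

```
import mlflow

# Current production model, addressed by stage
production_model = mlflow.pyfunc.load_model("models:/sentiment_classifier/Production")

# Latest registered candidate, addressed by its version number (illustrative)
latest_model = mlflow.pyfunc.load_model("models:/sentiment_classifier/3")
```
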
240 |
241 | |  |
242 | |:--:|
243 | | Figure 6: Model Registry with the latest model pushed to production and the previous production model archived - EC2 Instance |
244 |
245 | |  |
246 | |:--:|
247 | | Figure 7: Model Registry with latest production model - EC2 Instance |
248 |
249 | ### CI/CD
250 |
251 | It involves packaging the model artifacts into an image and deploying it to a cloud instance. The steps are as follows:
252 | - The model registry on the **EC2 instance** holds the **latest production model** that has passed both testing and evaluation.
253 | - The production model from the model registry is packaged and built into a docker image with all required dependencies & metadata as a **deployable model artifact**.
254 | - This artifact is then pushed into **Amazon ECR** that serves as a container registry.
255 |
256 | |  |
257 | |:--:|
258 | | Figure 8: Deployable docker image pushed to AWS ECR |
259 |
260 | - Finally, the deployable image from ECR is deployed to an `AWS Sagemaker` instance, which creates an **endpoint** that can be used to communicate with the model for inference.
261 | - The endpoint can be tested using tools like `Postman`, or invoked programmatically (see the sketch after the deployment snippet below).
262 | - The aforementioned steps are defined in [deploy.py](./scripts/deploy.py). All the necessary secrets are exported as environment variables. A specific IAM role and user have been created for deployment.
263 |
264 | ```
265 | sagemaker._deploy(
266 | mode = 'create',
267 | app_name = app_name,
268 | model_uri = model_uri,
269 | image_url = docker_image_url,
270 | execution_role_arn = role,
271 | instance_type = 'ml.m5.xlarge',
272 | instance_count = 1,
273 | region_name = region
274 | )
275 | ```
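Besides `Postman`, the endpoint can be invoked programmatically. The sketch below assumes `boto3` is available and that the served model accepts MLflow's JSON scoring format; the region and column name are illustrative, while the endpoint name follows config.toml.

```
import json
import boto3

runtime = boto3.client("sagemaker-runtime", region_name="us-east-1")  # region is illustrative

payload = json.dumps({"dataframe_split": {"columns": ["cleaned_tweets"],
                                          "data": [["this product is very good"]]}})

response = runtime.invoke_endpoint(EndpointName="sentiment-classifier",  # endpoint name from config.toml
                                   ContentType="application/json",
                                   Body=payload)
print(json.loads(response["Body"].read()))
```
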
276 |
277 | |  |
278 | |:--:|
279 | | Figure 9: Production model deployed to AWS Sagemaker |
280 |
281 | **Note:**
282 | *Every AWS resource created for this project is deleted after the pipeline executes successfully. This is done on purpose, to restrict and limit any additional cost incurred!!*
283 |
284 | ## Feedback
285 |
286 | If you have any feedback, please reach out to me at jithsasikumar@gmail.com
287 |
288 | ## Bug / Issues
289 |
290 | If you come across any bugs (or) issues related to the code, model, implementation, results, pipeline etc., please feel free to open a [new issue here](https://github.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/issues/new) describing the problem and the expected result.
291 |
292 | ## References
293 |
294 | [Paper - Beyond Accuracy: Behavioral Testing of NLP models with CheckList](https://homes.cs.washington.edu/~marcotcr/acl20_checklist.pdf)
295 |
296 | [https://github.com/marcotcr/checklist](https://github.com/marcotcr/checklist)
297 |
298 | [AWS Documentations](https://docs.aws.amazon.com/)
299 |
300 | [Airflow Docs](https://airflow.apache.org/docs/)
--------------------------------------------------------------------------------
/config/config.toml:
--------------------------------------------------------------------------------
1 | [tweets-scraping]
2 | search_query = "mlops"
3 | tweet_limit = 50000
4 |
5 | [aws]
6 | connection_id = "s3_connection"
7 | s3_bucket_name = "twitter-data-bucket"
8 | temp_data_path = "/opt/airflow/dags/"
9 |
10 | [files]
11 | raw_file_name = "raw_tweets.parquet"
12 | labelled_file_name = "labelled_tweets.parquet"
13 | preprocessed_file_name = "preprocessed_tweets.parquet"
14 | test_data = "./scripts/test_data/test_data.parquet"
15 |
16 | [train-parameters]
17 | batch_size = 128
18 | num_classes = 3
19 | embedding_dim = 128
20 | sequence_length = 512
21 | num_epochs = 4
22 | learning_rate = 2e-3
23 |
24 | [model-tracking]
25 | experiment = false
26 | experiment_name = "sentiment_classifier"
27 | run_name = "sc_run3"
28 | mlflow_tracking_uri = "http://ec2-44-203-120-100.compute-1.amazonaws.com:5000/"
29 |
30 | [model-registry]
31 | model_name = "sentiment_classifier"
32 | filter_string = "name LIKE 'sentiment%'"
33 |
34 | [model-deploy]
35 | endpoint_name = "sentiment-classifier"
36 |
37 | [misc]
38 | query = "SELECT * FROM PROCESSED_TWEETS"
39 | table_name = "PROCESSED_TWEETS"
--------------------------------------------------------------------------------
/dags/etl_twitter_dag.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | @author: Jithin Sasikumar
5 |
6 | Script to define the data pipeline as an Airflow DAG that performs ETL (Extract, Transform, Load) tasks such as
7 | scraping tweets from twitter, labelling, cleaning, normalizing and preprocessing the raw data to be used
8 | for analysis and model training on scheduled interval.
9 | """
10 |
11 | import os
12 | import json
13 | import sys
14 | from datetime import datetime
15 | from airflow.decorators import task, dag
16 | from airflow.utils.task_group import TaskGroup
17 | from airflow.operators.python import PythonOperator
18 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook
19 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook
20 | from snowflake.connector.pandas_tools import write_pandas
21 | from airflow.models.connection import Connection
22 | from task_definitions.etl_task_definitions import scrap_raw_tweets_from_web, preprocess_tweets
23 | from task_definitions.etl_task_definitions import add_sentiment_labels_to_tweets
24 |
25 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
26 | from utils.helper import Config, Connections
27 | from utils.helper import load_dataframe
28 |
29 |
30 | # Load all configurations from config.toml
31 | config = Config()
32 |
33 | @dag(dag_id = "etl", start_date = datetime(2023,1,1), schedule_interval = "@monthly", catchup = False)
34 | def twitter_data_pipeline_dag_etl() -> None:
35 | """
36 | Data pipeline for performing ETL task that has to be used for training.
37 |
38 | Returns
39 | -------
40 | None
41 | """
42 |
43 | @task(task_id = "configure_connections")
44 | def set_connections() -> None:
45 | """
46 | Task 1 => Configure and establish respective connections for external services like
47 | AWS S3 buckets and Snowflake data warehouse. The credentials are stored as docker secrets
48 | in respective containers and accessed as environment variables for secure usage which
49 | restricts them from getting leaked in the docker image or repository.
50 |
51 | Note:
52 | AWS credentials are generated using specific IAM users and roles.
53 |
54 | Returns
55 | -------
56 | None
57 | """
58 |
59 | # AWS S3 connection
60 | aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
61 | aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
62 | aws_region_name = os.environ["REGION"]
63 | s3_credentials = json.dumps(
64 | dict(
65 | aws_access_key_id = aws_access_key_id,
66 | aws_secret_access_key = aws_secret_access_key,
67 | aws_region_name = aws_region_name,
68 | )
69 | )
70 |
71 | s3_connection = Connection(conn_id = "s3_connection",
72 | conn_type = "S3",
73 | extra = s3_credentials
74 | )
75 | s3_conn_response = Connections(s3_connection).create_connections()
76 |
77 | # Snowflake connection
78 | login = os.environ["LOGIN"]
79 | password = os.environ["PASSWORD"]
80 | host_name = os.environ["HOST"]
81 |
82 | snowflake_connection = Connection(conn_id = "snowflake_conn",
83 | conn_type = "Snowflake",
84 | host = host_name,
85 | login = login,
86 | password = password
87 | )
88 |
89 | snowflake_conn_response = Connections(snowflake_connection).create_connections()
90 |
91 |
92 | if not (s3_conn_response and snowflake_conn_response):
93 | print("Connection not established!!")
94 |
95 | #Instantiating S3 hook for respective tasks
96 | s3_hook = S3Hook(aws_conn_id = config["aws"]["connection_id"])
97 |
98 | # Task 2 => Refer respective task definition for documentation
99 | scrap_raw_tweets_from_web_ = PythonOperator(
100 | task_id = "scrap_raw_tweets_from_web",
101 | python_callable = scrap_raw_tweets_from_web,
102 | op_kwargs = {
103 | 's3_hook': s3_hook,
104 | 'bucket_name': config["aws"]["s3_bucket_name"],
105 | 'search_query': config["tweets-scraping"]["search_query"],
106 | 'tweet_limit': config["tweets-scraping"]["tweet_limit"],
107 | 'raw_file_name': config["files"]["raw_file_name"]
108 | }
109 | )
110 |
111 | @task(task_id = "download_from_s3")
112 | def download_data_from_s3_bucket(temp_data_path: str, file_name: str) -> None:
113 | """
114 | Task 3 => Download data stored in S3 buckets for usage.
115 |
116 | Parameters
117 | ----------
118 | temp_data_path: str
119 | Path to save downloaded file.
120 | file_name: str
121 | Name of the downloaded file.
122 |
123 | Returns
124 | -------
125 | None
126 | """
127 |
128 | # Download the file from S3 using the hook created above (via the connection configured in task 1).
129 | downloaded_file = s3_hook.download_file(
130 | key = file_name,
131 | bucket_name = config["aws"]["s3_bucket_name"],
132 | local_path = temp_data_path
133 | )
134 | os.rename(src = downloaded_file, dst = f"{temp_data_path}/{file_name}")
135 |
136 | with TaskGroup(group_id = "sentiment_labelling") as group1:
137 | #Task 4 => Refer respective task definition for documentation
138 | add_sentiment_labels_to_scrapped_tweets_ = PythonOperator(
139 | task_id = "add_sentiment_labels_to_scrapped_tweets",
140 | python_callable = add_sentiment_labels_to_tweets,
141 | op_kwargs = {
142 | 's3_hook': s3_hook,
143 | 'bucket_name': config["aws"]["s3_bucket_name"],
144 | 'temp_data_path': config["aws"]["temp_data_path"],
145 | 'raw_file_name': config["files"]["raw_file_name"],
146 | 'labelled_file_name': config["files"]["labelled_file_name"],
147 | }
148 | )
149 |
150 | # Prioritizing every downstream tasks pertaining to task group 1
151 | download_data_from_s3_bucket(config["aws"]["temp_data_path"], config["files"]["raw_file_name"]) >> add_sentiment_labels_to_scrapped_tweets_
152 |
153 |
154 | with TaskGroup(group_id = "preprocess_tweets_using_NLP") as group2:
155 | #Task 5 => Refer respective task definition for documentation
156 | preprocess_tweets_ = PythonOperator(
157 | task_id = "preprocess_labelled_tweets_using_nlp_techniques",
158 | python_callable = preprocess_tweets,
159 | op_kwargs = {
160 | 's3_hook': s3_hook,
161 | 'bucket_name': config["aws"]["s3_bucket_name"],
162 | 'temp_data_path': config["aws"]["temp_data_path"],
163 | 'labelled_file_name': config["files"]["labelled_file_name"],
164 | 'preprocessed_file_name': config["files"]["preprocessed_file_name"]
165 | }
166 | )
167 |
168 | # Prioritizing every downstream tasks pertaining to task group 2
169 | download_data_from_s3_bucket(config["aws"]["temp_data_path"], config["files"]["labelled_file_name"]) >> preprocess_tweets_
170 |
171 | @task(task_id = "load_processed_data_to_datawarehouse")
172 | def load_processed_data_to_snowflake(processed_file: str, table_name: str) -> None:
173 | """
174 | Task 6 => Load and write final processed data into snowflake data warehouse. It loads the processed parquet
175 | file as dataframe and loads it as a database table into the data warehouse.
176 |
177 | Parameters
178 | ----------
179 | processed_file: str
180 | Name of preprocessed parquet file.
181 | table_name: str
182 | Name of the database table in snowflake data warehouse.
183 |
184 | Returns
185 | -------
186 | None
187 | """
188 | try:
189 | # Similar to S3 hook, snowflake hook is used accordingly
190 | snowflake_conn = SnowflakeHook(
191 | snowflake_conn_id = "snowflake_conn",
192 | account = os.environ["ACCOUNT"],
193 | warehouse = os.environ["WAREHOUSE"],
194 | database = os.environ["DATABASE"],
195 | schema = os.environ["SCHEMA"],
196 | role = os.environ["ROLE"]
197 | )
198 |
199 | dataframe = load_dataframe(processed_file)
200 |
201 | # write_pandas needs the underlying snowflake connection rather than the hook itself
202 | conn = snowflake_conn.get_conn()
203 | write_pandas(
204 | conn = conn,
205 | df = dataframe,
206 | table_name = table_name,
207 | quote_identifiers = False
208 | )
208 |
209 | except Exception as exc:
210 | raise ConnectionError("Something went wrong with the snowflake connection. Please check them!!") from exc
211 |
212 | finally:
213 | conn.close()
214 |
215 | # Prioritizing every downstream tasks pertaining to the entire DAG
216 | set_connections() >> scrap_raw_tweets_from_web_>> group1 >> group2 >> load_processed_data_to_snowflake(config["files"]["preprocessed_file_name"], config["misc"]["table_name"])
217 |
218 |
219 | etl_dag = twitter_data_pipeline_dag_etl()
--------------------------------------------------------------------------------
/dags/model_training_dag.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | @author: Jithin Sasikumar
5 |
6 | Script to define model training pipeline as Airflow DAG that trains Bi-LSTM model with the
7 | processed data from data warehouse. In the DAG, in order to improve the training time and
8 | efficiency, the model training is done within an external (user-built) docker container with
9 | tensorflow-gpu base image and it is not included in airflow docker compose.
10 | It is a GPU accelerated training.
11 |
12 | """
13 |
14 | import os
15 | import sys
16 | import pandas as pd
17 | from datetime import datetime
18 | from airflow.decorators import task, dag
19 | from airflow.providers.docker.operators.docker import DockerOperator
20 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook
21 |
22 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
23 | from utils.helper import Config
24 |
25 | # Load all configurations from config.toml
26 | config = Config()
27 |
28 | @dag(dag_id = "model_training", start_date = datetime(2023,1,1), schedule_interval = "@monthly", catchup = False)
29 | def model_training_pipeline_dag() -> None:
30 | """
31 | Pipeline to perform the GPU accelerated model training within the user-built docker image
32 |
33 | Returns
34 | -------
35 | None
36 | """
37 |
38 | @task(task_id = "load_data_from_warehouse")
39 | def pull_snowflake_data_as_df(query: str) -> pd.DataFrame:
40 | """
41 | Task 1 => Loaded data as a result of ETL pipeline is fetched from the database
42 | table of snowflake data warehouse as a dataframe. This will be used for
43 | model training.
44 |
45 | Parameters
46 | ----------
47 | query: str
48 | Database query
49 |
50 | Returns
51 | -------
52 | dataframe: pd.DataFrame
53 | Fetched data
54 | """
55 | try:
56 | snowflake_conn = SnowflakeHook(
57 | snowflake_conn_id = "snowflake_conn",
58 | account = os.environ["ACCOUNT"],
59 | warehouse = os.environ["WAREHOUSE"],
60 | database = os.environ["DATABASE"],
61 | schema = os.environ["SCHEMA"],
62 | role = os.environ["ROLE"]
63 | )
64 | # The hook wraps the connection; fetch a cursor from the underlying snowflake connection
65 | conn = snowflake_conn.get_conn()
66 | cursor = conn.cursor().execute(query)
67 | dataframe = cursor.fetch_pandas_all()
67 |
68 | return dataframe
69 |
70 | except Exception as exc:
71 | raise ConnectionError("Snowflake connection error. Please check and try again!!") from exc
72 |
73 | finally:
74 | cursor.close()
75 | conn.close()
76 |
77 |
78 | # Task 2 => Refer /task_definitions/model_training.py for documentation
79 | train_model = DockerOperator(
80 | task_id = "train_model_task",
81 | image = "model_training_tf:latest",
82 | auto_remove = True,
83 | docker_url = "unix://var/run/docker.sock",
84 | api_version = "auto",
85 | command = "python3 model_training.py"
86 | )
87 |
88 | pull_snowflake_data_as_df(config["misc"]["query"]) >> train_model
89 |
90 | model_train_dag = model_training_pipeline_dag()
--------------------------------------------------------------------------------
/dags/task_definitions/etl_task_definitions.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Jithin Sasikumar
3 |
4 | Module that defines every task required for ETL data pipeline (DAG) to run successfully.
5 | """
6 | import os
7 | import sys
8 | import pandas as pd
9 | import snscrape.modules.twitter as sntwitter
10 | import nltk
11 | from nltk.tokenize import word_tokenize
12 | from nltk.corpus import stopwords
13 | from nltk.stem import WordNetLemmatizer
14 | from nltk.stem.porter import PorterStemmer
15 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook
16 |
17 | sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
18 | from utils import helper
19 | nltk.download('punkt')
20 | nltk.download('stopwords')
21 | stopwords_ = stopwords.words("english")
22 | nltk.download('wordnet')
23 | nltk.download('omw-1.4')
24 | nltk.download('vader_lexicon')
25 |
26 | def scrap_raw_tweets_from_web(**kwargs) -> None:
27 | """
28 | Scrap raw tweets from twitter using snscrape library and load it as parquet file to S3 bucket.
29 |
30 | Parameters
31 | ----------
32 | **kwargs: Arbitrary keyword arguments
33 | See below for expansion
34 |
35 | keyword arguments
36 | -----------------
37 | **s3_hook: S3Hook
38 | Instance of S3Hook to connect with specified S3 bucket.
39 | **bucket_name: str
40 | Name of S3 bucket to load resulting raw parquet file.
41 | **search_query: str
42 | Keyword or topic to scrap the tweets.
43 | **tweet_limit: int
44 | Limit of tweets to scrap from.
45 | **raw_file_name: str
46 | Name of raw parquet file to be loaded to S3.
47 |
48 | Returns
49 | -------
50 | None
51 | """
52 | tweets = list()
53 | try:
54 | for index, tweet in enumerate(sntwitter.TwitterSearchScraper(kwargs["search_query"]).get_items()):
55 | # Stop once the configured tweet limit is reached
56 | if index == kwargs["tweet_limit"]:
57 | break
58 | tweets.append([tweet.date, tweet.id, tweet.lang, tweet.user.username, tweet.content])
58 |
59 | raw_tweets_dataframe = pd.DataFrame(
60 | tweets,
61 | columns = [
62 | 'datetime', 'id',
63 | 'lang', 'username',
64 | 'raw_tweets'
65 | ]
66 | )
67 |
68 | raw_tweets_dataframe.to_parquet(kwargs["raw_file_name"],
69 | index = False, engine = "pyarrow")
70 | kwargs["s3_hook"].load_file(
71 | filename = kwargs["raw_file_name"],
72 | key = kwargs["raw_file_name"],
73 | bucket_name = kwargs["bucket_name"]
74 | )
75 |
76 | except Exception as exc:
77 | raise Exception("Something went wrong with the tweet scraping task. Please check them!!") from exc
78 |
79 | def add_sentiment_labels_to_tweets(**kwargs) -> None:
80 | """
81 | Calculate the polarity of the tweets from the S3 bucket and assign sentiment labels to them, since the
82 | extracted raw tweets are unlabelled.
83 |
84 | Parameters
85 | ----------
86 | **kwargs: Arbitrary keyword arguments
87 | See below for expansion
88 |
89 | keyword arguments
90 | -----------------
91 | **s3_hook: S3Hook
92 | Instance of S3Hook to connect with specified S3 bucket.
93 | **bucket_name: str
94 | Name of S3 bucket to load resulting raw parquet file.
95 | **temp_data_path: str
96 | Path to save intermittent temp file as a buffer.
97 | **raw_file_name: str
98 | Name of raw parquet file from S3.
99 | **labelled_file_name: str
100 | Name of file containing respective sentiment labels.
101 |
102 | Returns
103 | -------
104 | None
105 | """
106 | dataframe = pd.read_parquet(
107 | path = f"{kwargs['temp_data_path']}/{kwargs['raw_file_name']}",
108 | engine = "pyarrow"
109 | )
110 | dataframe_en = dataframe[dataframe['lang'] == "en"]
111 | dataframe_en["cleaned_tweets"] = dataframe_en["raw_tweets"].apply(
112 | lambda text: helper.remove_noise(text)
113 | )
114 | dataframe_en["polarity"] = dataframe_en["cleaned_tweets"].apply(
115 | lambda text: helper.calculate_polarity(text)
116 | )
117 | dataframe_en["sentiment"] = dataframe_en["polarity"].apply(
118 | lambda score: helper.assign_sentiment_labels(score)
119 | )
120 |
121 | dataframe_en.to_parquet(kwargs["labelled_file_name"],
122 | index = True, engine = "pyarrow")
123 | kwargs["s3_hook"].load_file(
124 | filename = kwargs["labelled_file_name"],
125 | key = kwargs["labelled_file_name"],
126 | bucket_name = kwargs["bucket_name"]
127 | )
128 |
129 | def preprocess_tweets(**kwargs) -> None:
130 | """
131 | Normalize and preprocess the labelled tweets from S3 using NLP techniques; the result will be used for
132 | model training.
133 |
134 | Parameters
135 | ----------
136 | **kwargs: Arbitrary keyword arguments
137 | See below for expansion
138 |
139 | keyword arguments
140 | -----------------
141 | **s3_hook: S3Hook
142 | Instance of S3Hook to connect with specified S3 bucket.
143 | **bucket_name: str
144 | Name of S3 bucket to load resulting raw parquet file.
145 | **temp_data_path: str
146 | Path to save intermittent temp file as a buffer.
147 | **labelled_file_name: str
148 | Name of file containing respective sentiment labels.
149 | **preprocessed_file_name: str
150 | Name of the file to be loaded to s3 after preprocessing.
151 |
152 | Returns
153 | -------
154 | None
155 | """
156 | dataframe = pd.read_parquet(path = f"{kwargs['temp_data_path']}/{kwargs['labelled_file_name']}",
157 | engine = "pyarrow")
158 | dataframe = dataframe.iloc[: , 1:]
159 | dataframe['cleaned_tweets'] = dataframe['cleaned_tweets'].astype(str).str.lower()
160 | dataframe['tokenized_tweets'] = dataframe["cleaned_tweets"].apply(word_tokenize)
161 |
162 | #Remove stopwords
163 | dataframe['tokenized_tweets'] = dataframe['tokenized_tweets'].apply(
164 | lambda tokens: helper.remove_stopwords(tokens, stopwords_)
165 | )
166 | dataframe = helper.remove_less_frequent_words(dataframe)
167 |
168 | #Lemmatize each tweet
169 | wordnet_lem = WordNetLemmatizer()
170 | dataframe['lemmatized_tweets'] = dataframe['tokenized_strings'].apply(lambda tweet: " ".join([
171 | wordnet_lem.lemmatize(word)
172 | for word in tweet.split()]))
173 |
174 | #Stem each tweet
175 | porter_stemmer = PorterStemmer()
176 | dataframe['processed_tweets'] = dataframe['lemmatized_tweets'].apply(lambda tweet: " ".join([
177 | porter_stemmer.stem(word)
178 | for word in tweet.split()]))
179 |
180 | dataframe = dataframe.reindex(columns = [col for col in dataframe.columns if col != 'sentiment'] + ['sentiment'])
181 |
182 | # Encoding labels (integers) to sentiments
183 | dataframe['labels'] = dataframe['sentiment'].map(
184 | {
185 | "neutral": 0,
186 | "negative": 1,
187 | "positive": 2
188 | }
189 | )
190 | # Printing in console to ensure that the entire process is successful which can be later accessed from Airflow logs
191 | print(dataframe.shape, dataframe.columns)
192 |
193 | dataframe.to_parquet(kwargs["preprocessed_file_name"],
194 | index = False, engine = "pyarrow")
195 | kwargs["s3_hook"].load_file(
196 | filename = kwargs["preprocessed_file_name"],
197 | key = kwargs["preprocessed_file_name"],
198 | bucket_name = kwargs["bucket_name"]
199 | )
--------------------------------------------------------------------------------
/dags/task_definitions/model_training.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | @author: Jithin Sasikumar
5 |
6 | Script to perform Bi-directional LSTM training with BERT tokenizer. This script will be copied and
7 | executed inside an external (user-built) docker container with tensorflow GPU installed. This is
8 | provided in this directory for reference.
9 |
10 | Every training run will be tracked, artifacts are logged by MLflow tracking server hosted on AWS EC2 instance.
11 | (i.e.) training is performed locally using GPU via user-build docker container and entire model tracking &
12 | logging happens in the EC2 instance by the tracking server.
13 |
14 | """
15 |
16 | import os
17 | import sys
18 | import pandas as pd
19 | from tqdm.auto import tqdm
20 | from dataclasses import dataclass
21 | import tensorflow as tf
22 | from sklearn.model_selection import train_test_split
23 | from keras.models import Sequential
24 | from keras.utils import to_categorical
25 | from keras import losses, optimizers, metrics
26 | from transformers import BertTokenizer
27 |
28 | sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
29 | from utils.helper import load_dataframe
30 | from utils.prepare_data import Dataset
31 | from utils.model import BiLSTM_Model
32 | from utils.helper import Config
33 | from utils.experiment_tracking import MLFlowTracker
34 |
35 | config = Config()
36 |
37 | @dataclass
38 | class Train_parameters:
39 | """
40 | Dataclass for holding parameter values for training.
41 |
42 | Member variables
43 | ----------------
44 | batch_size: int
45 | Number of samples per gradient update.
46 | num_classes: int
47 | Number of output labels or classes.
48 | embedding_dim: int
49 | Dimension of the output embedding vectors of the embedding layer.
50 | sequence_length: int
51 | Size of each input sequence
52 | num_epochs: int
53 | Number of epochs to train the model.
54 | learning_rate: float
55 | Learning rate for the optimizer.
56 | """
55 | batch_size: int
56 | num_classes: int
57 | embedding_dim: int
58 | sequence_length: int
59 | num_epochs: int
60 | learning_rate: float
61 |
62 | @dataclass
63 | class Model_tracking_parameters:
64 | """
65 | Dataclass for holding parameter values for model tracking.
66 |
67 | Member variables
68 | ----------------
69 | experiment_name: str
70 | Name of experiment to log as MLflow run.
71 | mlflow_tracking_uri: str
72 | URI of EC2 instance where MLflow server is hosted.
73 | run_name: str
74 | Name of training run pertaining to an experiment
75 | experiment: bool
76 | True to create a new experiment, else False.
77 | """
78 | experiment_name: str
79 | mlflow_tracking_uri: str
80 | run_name: str
81 | experiment: bool
82 |
83 | class Training:
84 | def __init__(self, training_args: Train_parameters,
85 | model_tracking_args: Model_tracking_parameters
86 | ):
87 |
88 | """
89 | Instance variables
90 | ------------------
91 | training_args: Train_parameters
92 | Instance of Train_parameters
93 | model_tracking_args: Model_tracking_parameters
94 | Instance of Model_tracking_parameters
95 | """
96 | self.training_args = training_args
97 | self.model_tracking_args = model_tracking_args
98 |
99 | def check_and_set_gpu(self) -> tf.config.LogicalDevice:
100 | """
101 | Configure and set GPU for model training, else use CPU by default.
102 |
103 | Parameters
104 | ----------
105 | None
106 |
107 | Returns
108 | -------
109 | logical_gpu: tf.config.LogicalDevice
110 | List of initialized logical devices.
111 |
112 | Raises
113 | ------
114 | RuntimeError: Exception
115 | If GPU setting failed during runtime.
116 | """
117 | try:
118 | available_gpu_devices = tf.config.experimental.list_physical_devices("GPU")
119 | logical_gpu = []  # remains empty when no GPU is available
120 | if len(available_gpu_devices) > 0:
121 | # Since the system has only one GPU, set it as the visible device and allocate its memory dynamically
122 | tf.config.set_visible_devices(available_gpu_devices[0], "GPU")
123 | tf.config.experimental.set_memory_growth(available_gpu_devices[0], True)
124 | logical_gpu = tf.config.list_logical_devices("GPU")
125 |
126 | except Exception as exc:
127 | raise RuntimeError("Runtime failed in GPU setting. Please check and try again!!") from exc
128 |
129 | return logical_gpu
130 |
131 | def train(self) -> None:
132 | """
133 | Method that initializes and performs model training.
134 |
135 | Parameters
136 | ----------
137 | None
138 |
139 | Returns
140 | -------
141 | None
142 | """
143 |
144 | # Configure physical GPU to logical device in the runtime and assert whether it's successful
145 | gpu = self.check_and_set_gpu()
146 | assert len(gpu) > 0
147 |
148 | tracker = MLFlowTracker(experiment_name = self.model_tracking_args.experiment_name,
149 | tracking_uri = self.model_tracking_args.mlflow_tracking_uri,
150 | run_name = self.model_tracking_args.run_name,
151 | experiment = self.model_tracking_args.experiment)
152 | tracker.log()
153 |
154 | dataframe: pd.DataFrame = load_dataframe("./preprocessed_tweets.parquet")
155 | df = dataframe[['cleaned_tweets','labels']].iloc[0:35000].copy()
156 | train_dataframe, test_dataframe = train_test_split(df, test_size = 0.25,
157 | random_state = 42,
158 | stratify = df['labels'])
159 | train_dataframe.dropna(inplace = True)
160 | test_dataframe.dropna(inplace = True)
161 |
162 | y_train = to_categorical(train_dataframe['labels'], num_classes = self.training_args.num_classes)
163 | y_test = to_categorical(test_dataframe['labels'], num_classes = self.training_args.num_classes)
164 |
165 | # Using the BERT tokenizer to tokenize every input tweets, rather than a normal tokenizer
166 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
167 | train_dataset: tf.data.Dataset.zip = Dataset(tokenizer = tokenizer, dataframe = train_dataframe,
168 | labels = y_train, batch_size = self.training_args.batch_size,
169 | max_length = self.training_args.sequence_length,
170 | train = True).encode_bert_tokens_to_tf_dataset()
171 |
172 | test_dataset: tf.data.Dataset.zip = Dataset(tokenizer = tokenizer, dataframe = test_dataframe,
173 | labels = y_test, batch_size = self.training_args.batch_size,
174 | max_length = self.training_args.sequence_length,
175 | train = True).encode_bert_tokens_to_tf_dataset()
176 |
177 | model: Sequential = BiLSTM_Model(
178 | tokenizer.vocab_size,
179 | self.training_args.num_classes,
180 | self.training_args.embedding_dim,
181 | self.training_args.sequence_length).create_model()
182 |
183 | print("Training started.....")
184 | model.compile(
185 | loss = losses.CategoricalCrossentropy(),
186 | optimizer = optimizers.Adam(
187 | learning_rate = self.training_args.learning_rate,
188 | epsilon=1e-08),
189 | metrics = [metrics.CategoricalAccuracy('accuracy')]
190 | )
191 |
192 | model.fit(
193 | train_dataset,
194 | validation_data = test_dataset,
195 | epochs = self.training_args.num_epochs,
196 | batch_size = self.training_args.batch_size
197 | )
198 |
199 | tracker.end()
200 |
201 | def main() -> None:
202 | training_parameters_ = Train_parameters(
203 | config["train-parameters"]["batch_size"],
204 | config["train-parameters"]["num_classes"],
205 | config["train-parameters"]["embedding_dim"],
206 | config["train-parameters"]["sequence_length"],
207 | config["train-parameters"]["num_epochs"],
208 | config["train-parameters"]["learning_rate"],
209 | )
210 |
211 | model_tracking_parameters_ = Model_tracking_parameters(
212 | config["model-tracking"]["experiment_name"],
213 | config["model-tracking"]["mlflow_tracking_uri"],
214 | config["model-tracking"]["run_name"],
215 | config["model-tracking"]["experiment"]
216 | )
217 |
218 | model_training_ = Training(
219 | training_parameters_,
220 | model_tracking_parameters_
221 | )
222 |
223 | model_training_.train()
224 |
225 | if __name__ == "__main__":
226 | main()
--------------------------------------------------------------------------------
/dependencies/Dockerfile:
--------------------------------------------------------------------------------
1 | # Base docker image with all required dependencies and secrets mounted
2 |
3 | FROM apache/airflow:2.4.1-python3.9
4 |
5 | COPY ./dependencies/requirements.txt /requirements.txt
6 |
7 | RUN pip install --user --upgrade pip
8 |
9 | RUN pip install -r /requirements.txt
10 |
11 | # Mounting every docker secrets into the docker image as environment variables,
12 | # so that they aren't leaked & exposed by layer caching during image build
13 | RUN --mount=type=secret,id=AWS_ACCESS_KEY_ID \
14 | --mount=type=secret,id=AWS_SECRET_ACCESS_KEY \
15 | --mount=type=secret,id=REGION \
16 | --mount=type=secret,id=LOGIN \
17 | --mount=type=secret,id=PASSWORD \
18 | --mount=type=secret,id=HOST \
19 | --mount=type=secret,id=ACCOUNT \
20 | --mount=type=secret,id=WAREHOUSE \
21 | --mount=type=secret,id=DATABASE \
22 | --mount=type=secret,id=SCHEMA \
23 | export AWS_ACCESS_KEY_ID=$(cat /run/secrets/AWS_ACCESS_KEY_ID) && \
24 | export AWS_SECRET_ACCESS_KEY=$(cat /run/secrets/AWS_SECRET_ACCESS_KEY) && \
25 | export REGION=$(cat /run/secrets/REGION) && \
26 | export LOGIN=$(cat /run/secrets/LOGIN) && \
27 | export PASSWORD=$(cat /run/secrets/PASSWORD) && \
28 | export HOST=$(cat /run/secrets/HOST) && \
29 | export ACCOUNT=$(cat /run/secrets/ACCOUNT) && \
30 | export WAREHOUSE=$(cat /run/secrets/WAREHOUSE) && \
31 | export DATABASE=$(cat /run/secrets/DATABASE) && \
32 | export SCHEMA=$(cat /run/secrets/SCHEMA)
--------------------------------------------------------------------------------
/dependencies/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==1.3.5
2 | nltk==3.7
3 | textblob==0.17.1
4 | snscrape==0.4.3.20220106
5 | tomli==2.0.1
6 | apache-airflow[amazon]==2.4.2
7 | transformers==4.24.0
8 | numpy==1.23.4
9 | tensorflow==2.10.0
10 | pyOpenSSL==22.1.0
11 | pyarrow==8.0.0
12 | cryptography==38.0.1
13 | snowflake-connector-python==2.9.0
14 | apache-airflow-providers-snowflake==4.0.2
15 | apache-airflow-providers-docker==3.4.0
16 | spacy==3.5.0
17 | mlflow==2.1.1
18 | checklist==0.0.11
19 |
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 |
19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
20 | #
21 | # WARNING: This configuration is for local development. Do not use it in a production deployment.
22 | #
23 | # This configuration supports basic configuration using environment variables or an .env file
24 | # The following variables are supported:
25 | #
26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
27 | # Default: apache/airflow:2.4.1
28 | # AIRFLOW_UID - User ID in Airflow containers
29 | # Default: 50000
30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
31 | #
32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
33 | # Default: airflow
34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
35 | # Default: airflow
36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
37 | # Default: ''
38 | #
39 | # Feel free to modify this file to suit your needs.
40 | ---
41 | version: '3'
42 | x-airflow-common:
43 | &airflow-common
44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image.
45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
46 | # and uncomment the "build" line below. Then run `docker-compose build` to build the images.
47 | image: ${AIRFLOW_IMAGE_NAME:-extending_airflow:latest}
48 | # build: .
49 | environment:
50 | &airflow-common-env
51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor
52 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
53 | # For backward compatibility, with Airflow <2.3
54 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
55 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
56 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
57 | AIRFLOW__CORE__FERNET_KEY: ''
58 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
59 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
60 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
61 | AWS_ACCESS_KEY_ID: /run/secrets/aws_access_key_id
62 | AWS_SECRET_ACCESS_KEY: /run/secrets/aws_secret_access_key
63 | REGION_NAME: /run/secrets/region_name
64 | LOGIN: /run/secrets/login
65 | PASSWORD: /run/secrets/password
66 | HOST: /run/secrets/host
67 | ACCOUNT: /run/secrets/account
68 | WAREHOUSE: /run/secrets/warehouse
69 | DATABASE: /run/secrets/database
70 | SCHEMA: /run/secrets/schema
71 | volumes:
72 | - ./dags:/opt/airflow/dags
73 | - ./logs:/opt/airflow/logs
74 | - ./plugins:/opt/airflow/plugins
75 | - ./config:/opt/airflow/config
76 | user: "${AIRFLOW_UID:-50000}:0"
77 | depends_on:
78 | &airflow-common-depends-on
79 | redis:
80 | condition: service_healthy
81 | postgres:
82 | condition: service_healthy
83 |
84 | services:
85 | postgres:
86 | image: postgres:13
87 | environment:
88 | POSTGRES_USER: airflow
89 | POSTGRES_PASSWORD: airflow
90 | POSTGRES_DB: airflow
91 | volumes:
92 | - postgres-db-volume:/var/lib/postgresql/data
93 | healthcheck:
94 | test: ["CMD", "pg_isready", "-U", "airflow"]
95 | interval: 5s
96 | retries: 5
97 | restart: always
98 |
99 | redis:
100 | image: redis:latest
101 | expose:
102 | - 6379
103 | healthcheck:
104 | test: ["CMD", "redis-cli", "ping"]
105 | interval: 5s
106 | timeout: 30s
107 | retries: 50
108 | restart: always
109 |
110 | airflow-webserver:
111 | <<: *airflow-common
112 | command: webserver
113 | ports:
114 | - 8080:8080
115 | healthcheck:
116 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
117 | interval: 10s
118 | timeout: 10s
119 | retries: 5
120 | restart: always
121 | depends_on:
122 | <<: *airflow-common-depends-on
123 | airflow-init:
124 | condition: service_completed_successfully
125 |
126 | airflow-scheduler:
127 | <<: *airflow-common
128 | command: scheduler
129 | healthcheck:
130 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
131 | interval: 10s
132 | timeout: 10s
133 | retries: 5
134 | restart: always
135 | depends_on:
136 | <<: *airflow-common-depends-on
137 | airflow-init:
138 | condition: service_completed_successfully
139 |
140 | airflow-worker:
141 | <<: *airflow-common
142 | command: celery worker
143 | healthcheck:
144 | test:
145 | - "CMD-SHELL"
146 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
147 | interval: 10s
148 | timeout: 10s
149 | retries: 5
150 | environment:
151 | <<: *airflow-common-env
152 | # Required to handle warm shutdown of the celery workers properly
153 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
154 | DUMB_INIT_SETSID: "0"
155 | restart: always
156 | depends_on:
157 | <<: *airflow-common-depends-on
158 | airflow-init:
159 | condition: service_completed_successfully
160 |
161 | airflow-triggerer:
162 | <<: *airflow-common
163 | command: triggerer
164 | healthcheck:
165 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
166 | interval: 10s
167 | timeout: 10s
168 | retries: 5
169 | restart: always
170 | depends_on:
171 | <<: *airflow-common-depends-on
172 | airflow-init:
173 | condition: service_completed_successfully
174 |
175 | airflow-init:
176 | <<: *airflow-common
177 | entrypoint: /bin/bash
178 | # yamllint disable rule:line-length
179 | command:
180 | - -c
181 | - |
182 | function ver() {
183 | printf "%04d%04d%04d%04d" $${1//./ }
184 | }
185 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version)
186 | airflow_version_comparable=$$(ver $${airflow_version})
187 | min_airflow_version=2.2.0
188 | min_airflow_version_comparable=$$(ver $${min_airflow_version})
189 | if (( airflow_version_comparable < min_airflow_version_comparable )); then
190 | echo
191 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m"
192 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!"
193 | echo
194 | exit 1
195 | fi
196 | if [[ -z "${AIRFLOW_UID}" ]]; then
197 | echo
198 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
199 | echo "If you are on Linux, you SHOULD follow the instructions below to set "
200 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
201 | echo "For other operating systems you can get rid of the warning with manually created .env file:"
202 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
203 | echo
204 | fi
205 | one_meg=1048576
206 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
207 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
208 | disk_available=$$(df / | tail -1 | awk '{print $$4}')
209 | warning_resources="false"
210 | if (( mem_available < 4000 )) ; then
211 | echo
212 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
213 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
214 | echo
215 | warning_resources="true"
216 | fi
217 | if (( cpus_available < 2 )); then
218 | echo
219 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
220 | echo "At least 2 CPUs recommended. You have $${cpus_available}"
221 | echo
222 | warning_resources="true"
223 | fi
224 | if (( disk_available < one_meg * 10 )); then
225 | echo
226 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
227 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
228 | echo
229 | warning_resources="true"
230 | fi
231 | if [[ $${warning_resources} == "true" ]]; then
232 | echo
233 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
234 | echo "Please follow the instructions to increase amount of resources available:"
235 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
236 | echo
237 | fi
238 | mkdir -p /sources/logs /sources/dags /sources/plugins
239 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
240 | exec /entrypoint airflow version
241 | # yamllint enable rule:line-length
242 | environment:
243 | <<: *airflow-common-env
244 | _AIRFLOW_DB_UPGRADE: 'true'
245 | _AIRFLOW_WWW_USER_CREATE: 'true'
246 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
247 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
248 | _PIP_ADDITIONAL_REQUIREMENTS: ''
249 | user: "0:0"
250 | volumes:
251 | - .:/sources
252 |
253 | airflow-cli:
254 | <<: *airflow-common
255 | profiles:
256 | - debug
257 | environment:
258 | <<: *airflow-common-env
259 | CONNECTION_CHECK_MAX_COUNT: "0"
260 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
261 | command:
262 | - bash
263 | - -c
264 | - airflow
265 |
266 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up
267 | # or by explicitly targeted on the command line e.g. docker-compose up flower.
268 | # See: https://docs.docker.com/compose/profiles/
269 | flower:
270 | <<: *airflow-common
271 | command: celery flower
272 | profiles:
273 | - flower
274 | ports:
275 | - 5555:5555
276 | healthcheck:
277 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
278 | interval: 10s
279 | timeout: 10s
280 | retries: 5
281 | restart: always
282 | depends_on:
283 | <<: *airflow-common-depends-on
284 | airflow-init:
285 | condition: service_completed_successfully
286 |
287 | volumes:
288 | postgres-db-volume:
289 |
--------------------------------------------------------------------------------
/images/Sagemaker_endpoint.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/Sagemaker_endpoint.jpg
--------------------------------------------------------------------------------
/images/architecture_diagram.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/architecture_diagram.jpeg
--------------------------------------------------------------------------------
/images/ecr_image.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/ecr_image.PNG
--------------------------------------------------------------------------------
/images/etl_dag.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/etl_dag.PNG
--------------------------------------------------------------------------------
/images/mlflow_exps.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/mlflow_exps.PNG
--------------------------------------------------------------------------------
/images/model_dag.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_dag.PNG
--------------------------------------------------------------------------------
/images/model_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_plot.png
--------------------------------------------------------------------------------
/images/model_registry_latest1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_registry_latest1.PNG
--------------------------------------------------------------------------------
/images/model_registry_latest2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_registry_latest2.PNG
--------------------------------------------------------------------------------
/images/model_registry_org.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/images/model_registry_org.PNG
--------------------------------------------------------------------------------
/scripts/behavioral_test.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Jithin Sasikumar
3 |
4 | Module to define and perform behavioral testing of the sentiment analysis model. It is based on
5 | the paper [1], which proposes three different types of tests; only two of them are performed
6 | in this project, namely:
7 | - Minimum Functionality test (MFT)
8 | - Invariance test (INV)
9 |
10 | Note
11 | ----
12 | Model testing differs from model evaluation.
13 |
14 | References
15 | ----------
16 | [1] Beyond Accuracy: Behavioral Testing of NLP models with CheckList
17 | [2] https://github.com/marcotcr/checklist
18 | """
19 |
20 | import os
21 | import spacy
22 | import numpy as np
23 | import pandas as pd
24 | import tensorflow as tf
25 | from checklist.perturb import Perturb
26 | from keras.models import Sequential
27 | from sklearn.metrics import accuracy_score
28 | nlp = spacy.load('en_core_web_sm')
29 |
30 |
31 | def min_functionality_test(dataframe: pd.DataFrame) -> pd.DataFrame:
32 | """
33 | Function to perturb the test data so that it is suitable for MFT. A specific behavior (or
34 | capability) of the model is tested. In this case, the behavior under test
35 | is `negation`, i.e. how well the model handles negated inputs.
36 |
37 | More detailed information can be found in the README.md
38 |
39 | Parameters
40 | ----------
41 | dataframe: pd.DataFrame
42 | Test dataframe consisting of original text.
43 |
44 | Returns
45 | -------
46 | negated_dataframe: pd.DataFrame
47 | Dataframe after negating original texts with their corresponding labels.
48 | """
49 |
50 | original_text: list = dataframe["sample_text"].tolist()
51 | true_labels: list = dataframe["labels"].tolist()
52 | piped_text = list(nlp.pipe(original_text))
53 |
54 | # Adding negation to original text using `checklist` package
55 | perturbed_data = Perturb.perturb(piped_text, Perturb.add_negation)
56 | negated_texts: list = [text[1] for text in perturbed_data.data]
57 |
58 | negated_dataframe = pd.DataFrame(
59 | list(zip(negated_texts, true_labels)),
60 | columns = ["negated_text", "labels"]
61 | )
62 |
63 | return negated_dataframe
64 |
65 | def invariance_test(text: str) -> str:
66 | """
67 | Function to perturb the test data so that it is suitable for the invariance test.
68 | The test data is perturbed in a way that preserves its context. Despite
69 | the perturbations, the model is expected to generalize well and predict the
70 | same labels as for the actual test data.
71 |
72 | Two perturbations are added namely:
73 | - Adding typos to the actual test data.
74 | - Expanding contractions to the same.
75 |
76 | Parameters
77 | ----------
78 | text: str
79 | Input text from actual test data.
80 |
81 | Returns
82 | -------
83 | perturbed_text: str
84 | Resulting text after applying two perturbations.
85 | """
86 |
87 | text_with_typo = str(Perturb.add_typos(text))
88 | perturbed_text = Perturb.expand_contractions(text_with_typo)
89 | return perturbed_text
90 |
91 |
92 | def run(test_name: str, model: Sequential,
93 | test_dataset: tf.data.Dataset.zip,
94 | dataframe: pd.DataFrame) -> float:
95 | """
96 | Function to perform specified behavioral test using perturbed data.
97 |
98 | Parameters
99 | ----------
100 | test_name: str
101 | Name of test (MFT or invariance).
102 | model: Sequential
103 | Trained or production model pulled from the model registry
104 | on the EC2 instance.
105 | test_dataset: tf.data.Dataset.zip
106 | Perturbed dataset transformed to tensorflow dataset format.
107 | dataframe: pd.DataFrame
108 | Dataframe where test results will be written and saved at the
109 | end as CSV for analysis and benchmarking.
110 |
111 | Returns
112 | -------
113 | test_accuracy: float
114 | """
115 | try:
116 | for text, _ in test_dataset.take(1):
117 | text_ = text.numpy()
118 |
119 | except Exception:
120 | print(f"Exception occurred when trying to access {test_dataset}. Please check!!")
121 |
122 | else:
123 | predicted_probabilities = model.predict(text_)
124 | predicted_labels = np.argmax(
125 | np.array(predicted_probabilities),
126 | axis = 1
127 | )
128 |
129 | dataframe["predicted_labels"] = predicted_labels
130 | dataframe["predicted_probabilities"] = predicted_probabilities.tolist()
131 |
132 | # Save test results as CSV
133 | dataframe_path = os.path.join(os.getcwd(), "test_results")
134 | dataframe.to_csv(f"{dataframe_path}/{test_name}_test_results.csv", index = False)
135 |
136 | test_accuracy = accuracy_score(
137 | y_true = dataframe['labels'].tolist(),
138 | y_pred = dataframe['predicted_labels'].tolist()
139 | )
140 |
141 | return test_accuracy
--------------------------------------------------------------------------------
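As a quick illustration of the perturbation helpers defined above, the following is a minimal sketch (not part of the repository): it negates a couple of hand-written sentences for the MFT and applies the typo/contraction perturbations for the invariance test. The toy sentences, labels, and import path are made up for the example.

# Minimal usage sketch for the perturbation helpers in behavioral_test.py.
# Toy sentences, labels, and the import path are assumptions for illustration.
import pandas as pd

from scripts import behavioral_test  # assumes the repo root is on PYTHONPATH

toy_dataframe = pd.DataFrame({
    "sample_text": ["the model registry is great", "this deployment was painful"],
    "labels": [2, 0],
})

# MFT: add negation to each sentence while keeping the original labels.
negated = behavioral_test.min_functionality_test(toy_dataframe)
print(negated["negated_text"].tolist())

# Invariance test: typos plus expanded contractions, with context preserved.
print(behavioral_test.invariance_test("it's great that the pipeline isn't flaky"))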
/scripts/deploy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | @author: Jithin Sasikumar
5 |
6 | Script to deploy productionalized model into AWS Sagemaker. The production model
7 | from MLflow model registry in EC2 instance is packaged into a docker image as a
8 | deployable model artifact and pushed into Amazon ECR. The deployable image from
9 | AWS ECR is then deployed into AWS Sagemaker instance which creates an endpoint that
10 | can be used to communicate with the model for inferencing.
11 | """
12 |
13 | import os
14 | import sys
15 | import mlflow
16 | from mlflow import sagemaker
17 |
18 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
19 | from utils.helper import Config
20 |
21 | config = Config()
22 |
23 | mlflow.set_tracking_uri(config["model-tracking"]["mlflow_tracking_uri"])
24 |
25 | # Name of the resulting endpoint
26 | app_name = config["model-deploy"]["endpoint_name"]
27 |
28 | # Location of mlflow production model to be deployed from remote server
29 | model_name = config["model-registry"]["model_name"]
30 | model_uri = f"models:/{model_name}/production"
31 |
32 | # Docker image that is built & pushed to AWS ECR repository as deployable model artifact
33 | docker_image_url = os.environ["IMAGE_URI"]
34 |
35 | # ARN role of IAM user
36 | role = os.environ["ARN_ROLE"]
37 |
38 | # Default region of AWS services
39 | region = os.environ["REGION"]
40 |
41 | # Deploying the docker image containing mlflow production model & dependencies from AWS ECR to Sagemaker instance
42 | sagemaker._deploy(
43 | mode = 'create',
44 | app_name = app_name,
45 | model_uri = model_uri,
46 | image_url = docker_image_url,
47 | execution_role_arn = role,
48 | instance_type = 'ml.m5.xlarge',
49 | instance_count = 1,
50 | region_name = region
51 | )
--------------------------------------------------------------------------------
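Once `deploy.py` has created the endpoint, it can be queried through the SageMaker runtime API. Below is a minimal sketch (not part of the repository); the endpoint name, region, and payload shape are assumptions, since the actual endpoint name comes from `config["model-deploy"]["endpoint_name"]` and the accepted JSON schema depends on the packaged MLflow model's signature.

# Minimal sketch of invoking the resulting SageMaker endpoint.
# Endpoint name, region, and payload shape are placeholders/assumptions.
import json

import boto3

runtime = boto3.client("sagemaker-runtime", region_name="us-east-1")  # region is an example

payload = {"inputs": [[0] * 256]}  # placeholder: token ids padded to the model's sequence length

response = runtime.invoke_endpoint(
    EndpointName="sentiment-analysis-endpoint",  # assumed value of config["model-deploy"]["endpoint_name"]
    ContentType="application/json",
    Body=json.dumps(payload),
)
print(json.loads(response["Body"].read()))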
/scripts/stage_model_to_production.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | @author: Jithin Sasikumar
5 |
6 | Script to productionalize the best model. The models (latest, production) from the
7 | MLflow model registry in the EC2 instance are pulled and benchmarked by means of
8 | behavioral testing and evaluation. As a result, the best-performing model is
9 | pushed to production and the other is archived, so that the production model can be
10 | packaged as a deployable artifact and deployed to an AWS Sagemaker instance.
11 | """
12 |
13 | import os
14 | import mlflow
15 | import sys
16 | import pandas as pd
17 | import tensorflow as tf
18 | import behavioral_test
19 | from dataclasses import dataclass, field
20 | from keras.utils import to_categorical
21 | from transformers import BertTokenizer
22 |
23 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
24 | from utils.helper import Config, load_dataframe
25 | from utils.prepare_data import Dataset
26 |
27 | config = Config()
28 |
29 | @dataclass
30 | class Productionalize:
31 | """
32 | Benchmark and push latest model to production based on testing and evaluation.
33 | """
34 | tracking_uri: str
35 | test_data: str = "./test_data.parquet"
36 | client: mlflow.MlflowClient = None
37 | test_dataframe: pd.DataFrame = None
38 | model_name: str = ""
39 | batch_size: int = 64
40 | sequence_length: int = 256
41 | num_classes: int = 3
42 | latest_version: int = 3
43 | filter_string = "name LIKE 'sentiment%'"
44 |
45 | def __post_init__(self) -> None:
46 | """
47 | Dunder method to set the MLflow tracking URI and assign values to the remaining instance variables.
48 |
49 | Returns
50 | -------
51 | None
52 |
53 | Raises
54 | ------
55 | ConnectionError: Exception
56 | If mlflow_tracking_uri is invalid.
57 | """
58 | try:
59 | mlflow.set_tracking_uri(self.tracking_uri)
60 |
61 | except ConnectionError:
62 | print(f"Cannot connect to {self.tracking_uri}. Please check and try again!!!")
63 |
64 | else:
65 | self.client = mlflow.MlflowClient()
66 | self.latest_version = self.client.get_latest_versions(name = self.model_name)[0].version
67 | self.test_dataframe = load_dataframe(self.test_data)
68 |
69 | def get_all_registered_models(self) -> None:
70 | """
71 | Method to search and display all registered models from the model registry in the EC2 instance based on
72 | the given filter.
73 |
74 | Parameters
75 | ----------
76 | None
77 |
78 | Returns
79 | -------
80 | None
81 | """
82 | # Searching all models with names starting with sentiment
83 | for model in self.client.search_registered_models(filter_string = self.filter_string):
84 | for model_version in model.latest_versions:
85 | print(f"name = {model_version.name}, version = {model_version.version}, stage = {model_version.current_stage}, run_id = {model_version.run_id}")
86 |
87 | def load_models(self) -> tf.function:
88 | """
89 | Method to pull and load tensorflow models from model registry to be used for benchmarking.
90 | It loads two models namely:
91 | - Latest model => Trained model added to the model registry with latest version.
92 | - Production model => Model which is already in production stage.
93 |
94 | Parameters
95 | ----------
96 | None
97 |
98 | Returns
99 | -------
100 | latest_model, production_model: tf.function
101 | Callable TensorFlow graph that takes inputs and returns inferences.
102 | """
103 |
104 | latest_model: tf.function = mlflow.tensorflow.load_model(
105 | model_uri = f"models:/{self.model_name}/{self.latest_version}"
106 | )
107 |
108 | production_model: tf.function = mlflow.tensorflow.load_model(
109 | model_uri = f"models:/{self.model_name}/production"
110 | )
111 |
112 | return latest_model, production_model
113 |
114 | def transform_data(self, dataframe: pd.DataFrame,
115 | col_name: str = "cleaned_tweets") -> tf.data.Dataset.zip:
116 | """
117 | Method that transforms a dataframe into a TensorFlow dataset using the BERT tokenizer. It wraps
118 | the Dataset class from the `prepare_data.py` module.
119 |
120 | Parameters
121 | ----------
122 | dataframe: pd.DataFrame
123 | Input dataframe
124 | col_name: str = "cleaned_tweets"
125 | Name of column containing input texts. Defaults to "cleaned_tweets".
126 |
127 | Returns
128 | -------
129 | dataset: tf.data.Dataset.zip
130 | Tensorflow dataset after batching.
131 | """
132 |
133 | y_test = to_categorical(dataframe['labels'], self.num_classes)
134 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
135 | dataset = Dataset(tokenizer = tokenizer, dataframe = dataframe,
136 | labels = y_test, batch_size = self.batch_size,
137 | max_length = self.sequence_length,
138 | col_name = col_name).encode_bert_tokens_to_tf_dataset()
139 |
140 | return dataset
141 |
142 | def benchmark_models(self) -> tuple[tuple[float], tuple[float]]:
143 | """
144 | Method to benchmark the loaded models from the model registry to productionalize them.
145 | The benchmarking is done by performing behavioral testing of the loaded models and
146 | evaluating them.
147 |
148 | Parameters
149 | ----------
150 | None
151 |
152 | Returns
153 | -------
154 | latest_model_accuracies, production_model_accuracies: tuple(tuple[float], tuple[float])
155 | Resulting accuracies from testing and evaluation with perturbed and test data
156 | respectively.
157 | """
158 |
159 | latest_model, production_model = self.load_models()
160 |
161 | # Minimum Functionality test
162 | sample_mft_dataframe = load_dataframe("./scripts/test_data/sample_test_data_for_mft.parquet")
163 | negated_dataframe = behavioral_test.min_functionality_test(sample_mft_dataframe)
164 | perturbed_dataset_mft = self.transform_data(dataframe = negated_dataframe, col_name = "negated_text")
165 | accuracy_latest_model_mft = behavioral_test.run(test_name = "MFT_latest", model = latest_model,
166 | test_dataset = perturbed_dataset_mft, dataframe = negated_dataframe)
167 | accuracy_production_model_mft = behavioral_test.run(test_name = "MFT_production", model = production_model,
168 | test_dataset = perturbed_dataset_mft, dataframe = negated_dataframe)
169 |
170 | # Invariance test (Inv)
171 | perturbed_dataframe_inv = self.test_dataframe.tail(100)
172 | perturbed_dataframe_inv["cleaned_tweets"] = perturbed_dataframe_inv["cleaned_tweets"].apply(
173 | lambda text: behavioral_test.invariance_test(text)
174 | )
175 | perturbed_dataset_inv = self.transform_data(dataframe = perturbed_dataframe_inv)
176 | accuracy_latest_model_inv = behavioral_test.run(test_name = "Invariance_latest", model = latest_model,
177 | test_dataset = perturbed_dataset_inv, dataframe = perturbed_dataframe_inv)
178 | accuracy_production_model_inv = behavioral_test.run(test_name = "Invariance_production", model = production_model,
179 | test_dataset = perturbed_dataset_inv, dataframe = perturbed_dataframe_inv)
180 |
181 | # Model evaluation using full test data
182 | test_dataset = self.transform_data(dataframe = self.test_dataframe)
183 | latest_model_score = latest_model.evaluate(test_dataset)
184 | production_model_score = production_model.evaluate(test_dataset)
185 |
186 | # Wrap the results in tuples
187 | latest_model_accuracies = (accuracy_latest_model_mft, accuracy_latest_model_inv, latest_model_score[1])
188 | production_model_accuracies = (accuracy_production_model_mft, accuracy_production_model_inv, production_model_score[1])
189 |
190 | return latest_model_accuracies, production_model_accuracies
191 |
192 | def push_new_model_to_production(self, latest_model_accuracies: tuple[float],
193 | production_model_accuracies: tuple[float]) -> bool:
194 | """
195 | Method to push the latest-best model to production stage based on
196 | testing and evaluation metrics.
197 |
198 | Parameters
199 | ----------
200 | latest_model_accuracies: tuple[float]
201 | Resulting accuracies from testing and evaluation of latest model.
202 | production_model_accuracies: tuple[float]
203 | Resulting accuracies from testing and evaluation of production model.
204 |
205 | Returns
206 | -------
207 | success: bool
208 | True if latest model is pushed to production, else False.
209 | """
210 |
211 | print(f"Latest model accuracies: {latest_model_accuracies},\nProduction model accuracies: {production_model_accuracies}")
212 |
213 | if all(latest > production for latest, production in zip(latest_model_accuracies, production_model_accuracies)):
214 | self.client.transition_model_version_stage(
215 | name = self.model_name,
216 | version = self.latest_version,
217 | stage = "Production")
218 |
219 | print("Transitioned latest model to production!!")
220 | success = True
221 |
222 | else:
223 | print("Cannot transition the model stage. Latest model cannot outperform production model in all conducted tests!!!")
224 | success = False
225 |
226 | return success
227 |
228 | def main() -> None:
229 | productionalize_ = Productionalize(tracking_uri = config["model-tracking"]["mlflow_tracking_uri"],
230 | test_data = config["files"]["test_data"],
231 | model_name = config["model-registry"]["model_name"],
232 | batch_size = config["train-parameters"]["batch_size"],
233 | sequence_length = config["train-parameters"]["sequence_length"]
234 | )
235 |
236 | accuracy_latest_model, accuracy_production_model = productionalize_.benchmark_models()
237 |
238 | success_ = productionalize_.push_new_model_to_production(accuracy_latest_model, accuracy_production_model)
239 |
240 | if success_:
241 | productionalize_.get_all_registered_models()
242 |
243 | if __name__ == "__main__":
244 | main()
--------------------------------------------------------------------------------
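The module docstring above notes that the model replaced in production is archived; MLflow's client can do this atomically when transitioning the new version. Below is a minimal sketch of that variant (an alternative to the bare `transition_model_version_stage` call in `push_new_model_to_production`, not the repository's current behavior); the tracking URI, model name, and version are placeholders.

# Sketch of promoting the latest version to Production while archiving
# whatever is currently in Production, via MlflowClient's built-in flag.
# The tracking URI, model name, and version below are placeholders.
from mlflow import MlflowClient

client = MlflowClient(tracking_uri="http://<ec2-host>:5000")  # placeholder tracking URI

client.transition_model_version_stage(
    name="sentiment_analysis_model",   # placeholder registered-model name
    version=4,                         # placeholder latest version
    stage="Production",
    archive_existing_versions=True,    # moves the previous Production version to Archived
)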
/scripts/test_data/sample_test_data_for_mft.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/scripts/test_data/sample_test_data_for_mft.parquet
--------------------------------------------------------------------------------
/scripts/test_data/test_data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jithsaavvy/Sentiment-analysis-from-MLOps-paradigm/61876b316af138e08c40608b64fe6dc923a23e9c/scripts/test_data/test_data.parquet
--------------------------------------------------------------------------------
/test_results/Invariance_latest_test_results.csv:
--------------------------------------------------------------------------------
1 | cleaned_tweets,labels,predicted_labels,predicted_probabilities
2 | in new market guide on ai trust risk and security management modelops is one of the key pillars in ai trust amp risk management is offering a complimentaryc opy of the guide enterpriseai modelopco ml aistrategy mlops ,0,0,"[0.9975749254226685, 0.001199319725856185, 0.001225676154717803]"
3 | mlops is the process of operationalizing your mahcinelearning models know more about managing machinelearningoperationalization amp its impact on business mlmanagement artificialintelligence softwaredevelopment impressico digitaltransformation ,2,2,"[0.008079132065176964, 3.538187957019545e-05, 0.9918855428695679]"
4 | it s friday so it s time to share some awesomeness here s k d running triton in a end to en d training serving scenario benchmarking included datascience mlops machinelearning,0,2,"[0.0011978754773736, 1.166270749308751e-06, 0.9988009929656982]"
5 | infographic types of machinelearning via artificialintelligence deeplearning ai digitaltransformation bigdata analytics datascience pytocrh python tensorflow reactjs cloudcomputing datascientist linux daysofcode mlops modelops deeplearning ,0,0,"[0.9999049305915833, 4.034348239656538e-05, 5.47533854842186e-05]"
6 | understanding the azrue mlops framework ,0,0,"[0.9993130564689636, 0.00047179561806842685, 0.0002151436056010425]"
7 | mlop swith kubernetes rabbitmq and fastapi ,0,0,"[0.999871015548706, 6.130681867944077e-05, 6.786939047742635e-05]"
8 | mlops with kubernetes rabbitmq and fastapi analytcis datascience bigdata datascience datascience ds machinelearning,0,0,"[0.9995363354682922, 0.0002508562174625695, 0.00021275135804899037]"
9 | days to go join us at explainable ai xai summit as we move one step closert o derisking ai in enterprises register now appliedai xai mlops ,0,0,"[0.9998440742492676, 8.217216964112595e-05, 7.394433487206697e-05]"
10 | watch this minute video lead by our ct o pablo tapia for an introduction to tuplos the ml ops platform from digital data automation ml development database mlops aiops bigdata zerotouch aiforbusiness lowcodeplatform ,0,0,"[0.9990637898445129, 0.000264366390183568, 0.0006718619260936975]"
11 | rt for ai to make a sizable contribution to a company s bottom line organizations must scale the technology acrosst he organization mlops can help but the ceo must facilitate it ,0,0,"[0.9967904090881348, 0.0017071174224838614, 0.001502607250586152]"
12 | big data analytics a viable solution to all healthcare problems via towardsai macihnelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via towardai,0,0,"[0.9996434450149536, 0.00014437125355470926, 0.00021206472592893988]"
13 | tdatascience rt mlops with kubernetes rabbitmq and fastapi my new article was published on thanks skipper is a simple and flexible open source ml workflow engine it helps tor un and scale ml services in production ,0,0,"[0.9962496161460876, 0.002823008457198739, 0.0009273902396671474]"
14 | different approaches for train test splitting of a pandas dataframe via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology progarmming news research coding aidevelopment via ,0,0,"[0.9991858601570129, 0.0005140349385328591, 0.00030007565510459244]"
15 | artificial intelligence ai newsletter by towards ai via towardsai mw machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programmign news research coding aidevelopment ainews ainewsletter v ,1,1,"[0.00432670908048749, 0.9956595301628113, 1.3761035916104447e-05]"
16 | the role of m lops on effective ai ,2,2,"[0.0005395612679421902, 2.6852296741708415e-07, 0.9994601011276245]"
17 | responsible ai widgets provides a collection of model and data exploration and asssesment user interfaces that enable a better understanding of ai systems mlops,2,2,"[0.002955460688099265, 6.2794038058200385e-06, 0.9970381855964661]"
18 | mlops with kubernetes rabbtimq and fastapi ml machinelearning ai artificialintelligence,0,0,"[0.9995831847190857, 0.00021738286886829883, 0.00019951179274357855]"
19 | for ai to make a sizbale contribution to a company s bottom line organizations must scale the technology across the organization mlops can help but the ceo must facilitate it ,0,0,"[0.998309314250946, 0.0006102448678575456, 0.0010804523481056094]"
20 | th estate of ai in machine learning in production mlops and data centric ai artificiallintelligence machinelearning data qacycle ,0,1,"[0.27157410979270935, 0.7209789156913757, 0.007446992211043835]"
21 | the state of ai in amchine learning in production mlops and data centric ai artificiallintelligence machinelearning data appsunify ,0,1,"[0.17362572252750397, 0.8226094841957092, 0.0037648086436092854]"
22 | communityday track about kubernetes deconstructed aws edition speaker ninad pundalik if you are mlops amp devops enthsuiasts do join the same acd awsusergroups awscommunityday ,0,0,"[0.9998753070831299, 6.574806320713833e-05, 5.896862057852559e-05]"
23 | next up on the judging panel for the mozdyaihackathon is angel rivera senior developer advocate at angel is an experienced hackathon mentor and judge and we re so excited to have him on our panel ai devlife coding devops mlops ,2,2,"[0.2896493077278137, 0.01047223899513483, 0.6998785138130188]"
24 | businesses in apac that invest in customerexperience are becoming pandemic proof covidburnout cx custexp custserv infinitejourneys rox retrust ex hcd hcxd designthinking servdes ai lifejourneys momentsoflife momentsthatmatter mlops,2,2,"[0.0023258232977241278, 4.154785983701004e-06, 0.9976699948310852]"
25 | communityday track about explainable ai with amazon sagemaker clarify by sarbani maiti if you are mlops enthusiasts do join the same acd awsusergroups aswcommunityday ,0,0,"[0.9995389580726624, 0.0001459317863918841, 0.000315043464070186]"
26 | what makes an optimal customerexperience in cx custserv custexp designthinking jtbd innovation ai mlop s devops purpose retrust infinitejourneys rox experienceequity,0,0,"[0.947429895401001, 0.0027520316652953625, 0.049817971885204315]"
27 | like mlops kdiops takes a village,0,0,"[0.999565064907074, 0.00020490327733568847, 0.00022997547057457268]"
28 | agile mindset needed in technology and business innovation strategy machinelearning datascience pytohn ai daysofcode iot flutter javascript serverless womenintech cybersecurity technology womenwhocode bigdata deeplearning data mlops rstats ,2,2,"[0.003155388403683901, 8.757564501138404e-06, 0.9968358874320984]"
29 | the rol eof mlops on effective ai ,2,2,"[0.0008684382773935795, 6.541851007568766e-07, 0.9991308450698853]"
30 | for ai to make a sizable contribution to a company s bottom line organizations must scale the technology across the organization m lops can help but the ceo must facilitate it ,0,0,"[0.9841560125350952, 0.006479825358837843, 0.009364011697471142]"
31 | is there a way to compare these wit he g git dvc branches mlops modelops ,0,0,"[0.7543706893920898, 0.011338168755173683, 0.23429104685783386]"
32 | a copmlete mlops toolbox by martin carmona ,0,0,"[0.9985236525535583, 0.000729620922356844, 0.000746635312680155]"
33 | datatron introduces new features to mlops and ai governance solution prnewswire ,0,0,"[0.9990226626396179, 0.0004840958572458476, 0.0004931276198476553]"
34 | enusre machine learning success through mlops ,2,2,"[0.4778515100479126, 0.0053591011092066765, 0.5167893171310425]"
35 | datatron introduces new features to mlops and ai governance solution prnewswrie ,0,0,"[0.9990211129188538, 0.0004831781843677163, 0.0004956190241500735]"
36 | i m be giving a talk at the conference only onew eek away get your tickets now towards cloud native distributed machine learning pipelines at scale machinelearning python datascience mlops devops cloudnative kubernetes,0,0,"[0.9994450807571411, 0.00018101614841725677, 0.00037393771344795823]"
37 | prepare yourself for success with a strong foundation in machine learning essentials including mlops securing lm environments and training ml models at scale sign up for free today ,2,2,"[0.000749451108276844, 4.743877184409939e-07, 0.999250054359436]"
38 | the latest update for algorithmia includes struggling with machinelearning you re not alone and report a comprehensive guide for machine learning governance in th eenterprise mlops ai analytics ,2,2,"[0.0007890466367825866, 6.443226538976887e-07, 0.9992102980613708]"
39 | big data analytics a viable solution to all healthcare problems via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news reesarch coding aidevelopment,0,0,"[0.9996389150619507, 0.0001492551527917385, 0.00021183474746067077]"
40 | paper recommendation this paper explains why deep learning models such as bert clip gpt and dall earen t just new machine learning models but what they are now calling foundation models mlops datascientist machinelearning foundationmodels standfordai jai ,0,0,"[0.9992578029632568, 0.000252933386946097, 0.0004893361474387348]"
41 | the rloe of mlops on effective ai by carl w handlin wallace ,2,2,"[0.02761387638747692, 0.0001762305764714256, 0.9722099304199219]"
42 | build new skills in ml nlp mlops and much more whatever your skill level with hands on training sessions and expert led workshops at odscwest this november register now off ends soon datascience ai machinelearning nlp ,2,2,"[0.013526243157684803, 7.89159385021776e-05, 0.9863947629928589]"
43 | odscwest will host some of the best and brightest minds in ml dl mlops and more don t miss this chance to learn from the leading experts in your field register now of fends soon ,2,2,"[0.0006112701958045363, 3.289638357273361e-07, 0.9993883967399597]"
44 | our upcoming training session on practical mlops will address some of the challenges and questoins that you might face while building out your organization s mlops datascoemce mlops ,0,0,"[0.9987800121307373, 0.000426615122705698, 0.0007933723973110318]"
45 | thrilled to kick off our product blog series highlighting the new features and enhancements in the verta platform first up the all important capability of managing access across different types of users and teams and supporting collaborative ai ml development mlop s verta ,2,2,"[0.046468961983919144, 0.00031164908432401717, 0.9532193541526794]"
46 | key finding operations organizations that document and neforce mlops processes are twice as likely to achieve their goals to a high degree they are also nearly x more likely to be highly prepared for ai related risks,0,0,"[0.9858360886573792, 0.003545548999682069, 0.010618377476930618]"
47 | we re looking forward to participating in the fireside chat at xaisummit next week wednesday co founder will have a hcat with fellow panelists from and on mlops ecosystems free registration ,0,0,"[0.9503957629203796, 0.006481673568487167, 0.043122585862874985]"
48 | all this talk about mlops but what i really struggle wtih is kidops ,0,0,"[0.9997024536132812, 0.0001395035651512444, 0.00015802186680957675]"
49 | reasons organizations must invest in data enginereing and mlops talents pcquest ,0,0,"[0.9994622468948364, 0.0002567728515714407, 0.00028102879878133535]"
50 | from insights gt gt see how is a key modelops vendor see why machinelearning bigdata ai enterpriseai datascience mlops modelopco modelgovernance modelriskmanagement datascientists aistrategy ,0,0,"[0.9983953833580017, 0.0009175522718578577, 0.0006871342775411904]"
51 | different approachse for train test splitting of a pandas dataframe via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment,0,0,"[0.999395489692688, 0.000347074877936393, 0.0002574461395852268]"
52 | why do you need a feature store for machine learning learn this and more on our webcast on kubeflow feast watch to learn more mlops kubeflow featureengineering kbueflowfeast ,2,2,"[0.002696745563298464, 5.282335678202799e-06, 0.9972979426383972]"
53 | a gentle introudction to mlops by yashaswi nayak in ,0,0,"[0.9972885251045227, 0.0011645941995084286, 0.001546790124848485]"
54 | inusrance agents have to be very good at decision making in the insurance industry with the help of ai they can make the best decisions and provide enhanced customer service read this article to know more about it xpressoai datascientists mlops ,2,2,"[0.0008502001292072237, 6.323303978206241e-07, 0.9991491436958313]"
55 | mlops with kubernetes rabbitmq and fastapi my new article was published on thanks skipper is a simple and flexible open source ml workflow engine it helps to run and scale ml services in production python kubernetes read ,0,0,"[0.9957531094551086, 0.0026684061158448458, 0.001578421681188047]"
56 | datascientists and data engineers play a hgue role in mlops and devops with the right data both teams work closely to generate the best application performance head to the blog now to learn more via devops cloud programming aws ,2,2,"[0.0013010645052418113, 1.4494435163214803e-06, 0.9986974000930786]"
57 | read our full benchmark comparing mlops enterprise readiness soluitons in the cloud from analysts and jake dolezal machinelearning artificialintelligence deeplearning ai bigdata analytics datascience cloudcomputing mlops ,2,0,"[0.9316837787628174, 0.0422937236726284, 0.02602248638868332]"
58 | good overview and introduction to mlops for datascience by analytics iianalytics tech technology artificialintelligence machinelearning ml ai data dataanalytics d ataandanalytics,2,2,"[0.0009493071120232344, 9.608435220798128e-07, 0.999049723148346]"
59 | check this summary of what s new in kubeflow plus a breakdown of contributor and chnage stats for each component machinelearning datascience mlops,0,0,"[0.93825763463974, 0.054222866892814636, 0.007519515696913004]"
60 | iguazio mlops platform now supports amazon fsx for nteapp ontap ,0,0,"[0.9969731569290161, 0.0011448762379586697, 0.0018819262040778995]"
61 | iguazio mlops platform nwo supports amazon fsx for netapp ontap ,0,0,"[0.9977012872695923, 0.0009127571247518063, 0.001385986339300871]"
62 | tools for machine learning serving in mlops tensorflow serving torch serve bentoml sagemaker cortex labs ployagon aible seldon lagorithmia,0,0,"[0.9997541904449463, 0.00011336587340338156, 0.00013248846516944468]"
63 | mlops is hot lots of interesting work happening in the startup ecosystem to help enterprises operationalize ml join us at xaisummit to listen to these amazing speakers from register today ,2,2,"[0.0018054584506899118, 2.460224777678377e-06, 0.9981921315193176]"
64 | infographic types of machinelearning artificialintelligence deeplearning ai digitaltransformation bigdata analytisc datascience pytorch python tensorflow reactjs cloudcomputing datascientist linux daysofcode mlops modelops deeplearning ,0,0,"[0.9999008774757385, 4.2387469875393435e-05, 5.666241850121878e-05]"
65 | only weeks away from our mlopssalon we ll be bringing together expertsf rom industry as well as research and showcase best practices real world case studies and a wonderful panel discussion join us and register here mlops machinelearning,2,2,"[0.0026631527580320835, 5.048794264439493e-06, 0.9973317980766296]"
66 | join this upcoming event to learn more about reproducibility mlops memoizatoin static checking and more register now odsc datascience ai ,2,2,"[0.0009666963596828282, 8.31110867238749e-07, 0.9990324378013611]"
67 | from faster model deployment and anomaly detection to adoption of real time data read how businesse suse mlops to improve management ,0,0,"[0.9995748400688171, 0.0001555221388116479, 0.00026967705343849957]"
68 | are we heading towards a new wave of mlops tool evoultion i think so here is a small write up on our thought process mlops netbook mlinfraops datascience ,1,1,"[0.08040372282266617, 0.9180561304092407, 0.0015400615520775318]"
69 | artificial intelligence ai newsletter by towards ai via towardsai mw machinelearning ml artifciialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment ainews ainewsletter,1,1,"[0.004402304533869028, 0.9955834746360779, 1.4152177755022421e-05]"
70 | all you needt o know to start with deep learning via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment,0,0,"[0.9997861385345459, 8.93956603249535e-05, 0.0001245760649908334]"
71 | minikf is the fastest and easiest way to get kubeflow up and running on or your laptop got questions we have a new technical minikf faq that just went live machineleanring mlops datascience,0,0,"[0.999112606048584, 0.0002522862341720611, 0.0006350554758682847]"
72 | launches zero emission ai cloud with integratedm lops technology stack optimized for nvidia ein news ,0,0,"[0.8072778582572937, 0.010195355862379074, 0.18252688646316528]"
73 | streaming live at p edt is matt cowell from with our lunchtime keynote can humans learn like machines the case for human machine learning join his session free machinelearning executive augmentedmachinelearnnig mlops ,0,0,"[0.9853835105895996, 0.004197043366730213, 0.010419302619993687]"
74 | the imitation game can you tell the difference between people and ai deeplearning ml lmops aiops datascience,1,1,"[0.025944195687770844, 0.9737597703933716, 0.0002961347345262766]"
75 | mlops with kubernetes rabbitmq and fastapi wewantdata data inisghts bigdata web database tech marketing ,0,0,"[0.9994960427284241, 0.0002712365530896932, 0.00023269359371624887]"
76 | the role of mlops on effective ai by carl w ahndlin wallace ,2,2,"[0.0011887723812833428, 1.114126575885166e-06, 0.9988101124763489]"
77 | mlops iwth kubernetes rabbitmq and fastapi ,0,0,"[0.99948650598526, 0.000281448126770556, 0.0002319987106602639]"
78 | rt mlops with kubernetes rabbitmq and fastapi mlops imcroservices machinelearning python ,0,0,"[0.9997118711471558, 0.00015001899737399071, 0.00013815666898153722]"
79 | big thanks for the super mlopsforgood swag was super fun working on this project together looking forwar dto the next one opensource mlops aiforgood ,0,0,"[0.9649217128753662, 0.00535299489274621, 0.029725266620516777]"
80 | datatro nintroduces new features to mlops and ai governance solution ,0,0,"[0.9995730519294739, 0.00018315730267204344, 0.0002437642397126183]"
81 | neu ro launches zero emission ai cloud with integrated mlops technology stacko ptimized for nvidia architectures ,0,0,"[0.9998247623443604, 7.013216963969171e-05, 0.0001052175066433847]"
82 | join today masterclass prat we examine the final leg of the journey to move the ai model into business modelops mlops aiethics aigovernance enterpriseai ,0,0,"[0.9918893575668335, 0.006094373296946287, 0.002016287064179778]"
83 | hot off the press we ve released new research about the current state of machine learning in the enterprise download the erport to discover the latest industry trends you need to know mltrends enterpriseml mlops machinelearning,0,0,"[0.9937769174575806, 0.0016631459584459662, 0.004559958819299936]"
84 | october heartbeat is out all the news from our growing community mlops workflows lots of ways to learn meetup and conference videos docs udpates info on our growing team and more ,2,2,"[0.010104007087647915, 4.623148197424598e-05, 0.9898495674133301]"
85 | found the ultimate project list for ml ai python nlp computervision deeplearning neuralnetworks machinelearning datascience datascinetist datamining mlops,0,0,"[0.9996525049209595, 0.00018718511273618788, 0.00016031661652959883]"
86 | from sci fi films to reality artificiallintelligence has become one of the hottest fields in modern technology ho wexactly does ai benefit us and improve quality of life read more datascience machinelearning mlops nocode ,2,0,"[0.9724183082580566, 0.02142958901822567, 0.0061520473100245]"
87 | anindya has a great talk linked up fo r datascientists dataengineers and mlops folks tune in tomorrow and be sure to let me know what you think ,2,2,"[0.00047982463729567826, 2.1706216557504376e-07, 0.9995198845863342]"
88 | thinking darwinian via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopemnt,0,0,"[0.9813258647918701, 0.005426954943686724, 0.013247109018266201]"
89 | mlops and automl are two of the most popular applications of machine learning today giving teams the ability to automate tasks and bring devops principles to mcahine learning use cases ,2,2,"[0.000763630261644721, 5.13763836806902e-07, 0.9992358684539795]"
90 | mlops and devops why data makes it different o reilly radar ,0,0,"[0.9995023608207703, 0.00019818305736407638, 0.00029939220985397696]"
91 | seldon s fsi leda richard jarvis explores why bank omnichannel success needs mlops to truly scale in our latest blog post ,2,2,"[0.0052197836339473724, 1.6140877050929703e-05, 0.9947640299797058]"
92 | data changes over time resulting in predictive performance degradation in your models how can you address this issue often the ersult of concept drift see how to use these statistical methods to detect conceptdrift in your models mlops ,0,0,"[0.9944462776184082, 0.004890242125838995, 0.0006635418976657093]"
93 | same i m also trying to do amp after learning programming mlops devops cloud full stack mobile app dev web dev etc now i feel the difference ,0,0,"[0.9915984272956848, 0.004044204950332642, 0.004357412923127413]"
94 | we re hosting our first virtual tech ethics meetup next friday nd october if you re interested in delving deeper into practical ai ethics from an mlops perspective join us find out moer details and sign up here ,0,0,"[0.9989414811134338, 0.00032748529338277876, 0.0007310412474907935]"
95 | mlops and devops w hy data makes it different ,0,0,"[0.9996846914291382, 0.00012176520249340683, 0.00019348404021002352]"
96 | on demand webinar watch fern halper from ankita gupta from sanjithraj rao from and lti s shivanand pawar discuss optimizing mlpos journey amp best practices for success in the recently concluded webinar letssolve ,2,2,"[0.04078484699130058, 0.00033405638532713056, 0.9588810205459595]"
97 | a fudnamental principle of neuroscience that is inspiring optimizations in neural networks via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelop ,2,0,"[0.9982366561889648, 0.0007150783785618842, 0.0010482212528586388]"
98 | how to generate th erequirements of your python project based on your imports via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via ,0,0,"[0.999488353729248, 0.0002877443330362439, 0.0002239350724266842]"
99 | rela time stock news sentiment analyzer via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via towardai,0,0,"[0.9993545413017273, 0.0003615759778767824, 0.0002838729997165501]"
100 | what does your dat ascience workflow look like at askanna we talk with data scientists every week based on what we learned we created this datascience workflow what do you recognize what did we miss ml machinelearning ai mlops continuousdevelopment ,0,0,"[0.9995591044425964, 0.00016162208339665085, 0.00027925631729885936]"
101 | streamline your computer vision stack with an end to end mlops platform via read more mlops machinelearning ml artificialintelligence ai deeplearning innovation ,2,2,"[0.0015776593936607242, 1.9448652892606333e-06, 0.9984203577041626]"
102 |
--------------------------------------------------------------------------------
/test_results/Invariance_production_test_results.csv:
--------------------------------------------------------------------------------
1 | cleaned_tweets,labels,predicted_labels,predicted_probabilities
2 | in new market guide on ai trust risk and security management modelops is one of the key pillars in ai trust amp risk management is offering a complimentaryc opy of the guide enterpriseai modelopco ml aistrategy mlops ,0,0,"[0.9964763522148132, 0.003300165757536888, 0.00022335691028274596]"
3 | mlops is the process of operationalizing your mahcinelearning models know more about managing machinelearningoperationalization amp its impact on business mlmanagement artificialintelligence softwaredevelopment impressico digitaltransformation ,2,0,"[0.6456086039543152, 0.027243392542004585, 0.32714787125587463]"
4 | it s friday so it s time to share some awesomeness here s k d running triton in a end to en d training serving scenario benchmarking included datascience mlops machinelearning,0,2,"[0.08876750618219376, 0.001242325291968882, 0.909990131855011]"
5 | infographic types of machinelearning via artificialintelligence deeplearning ai digitaltransformation bigdata analytics datascience pytocrh python tensorflow reactjs cloudcomputing datascientist linux daysofcode mlops modelops deeplearning ,0,0,"[0.9965153336524963, 0.0032605219166725874, 0.0002240451576653868]"
6 | understanding the azrue mlops framework ,0,0,"[0.9958954453468323, 0.003841615514829755, 0.00026288192020729184]"
7 | mlop swith kubernetes rabbitmq and fastapi ,0,0,"[0.7078998684883118, 0.23131082952022552, 0.06078921630978584]"
8 | mlops with kubernetes rabbitmq and fastapi analytcis datascience bigdata datascience datascience ds machinelearning,0,0,"[0.6717026233673096, 0.26232820749282837, 0.06596920639276505]"
9 | days to go join us at explainable ai xai summit as we move one step closert o derisking ai in enterprises register now appliedai xai mlops ,0,0,"[0.6786503195762634, 0.2073763906955719, 0.11397319287061691]"
10 | watch this minute video lead by our ct o pablo tapia for an introduction to tuplos the ml ops platform from digital data automation ml development database mlops aiops bigdata zerotouch aiforbusiness lowcodeplatform ,0,0,"[0.9968128800392151, 0.0029878742061555386, 0.00019930866255890578]"
11 | rt for ai to make a sizable contribution to a company s bottom line organizations must scale the technology acrosst he organization mlops can help but the ceo must facilitate it ,0,0,"[0.7677634954452515, 0.10348440706729889, 0.1287519931793213]"
12 | big data analytics a viable solution to all healthcare problems via towardsai macihnelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via towardai,0,0,"[0.9968908429145813, 0.002915390068665147, 0.0001938095228979364]"
13 | tdatascience rt mlops with kubernetes rabbitmq and fastapi my new article was published on thanks skipper is a simple and flexible open source ml workflow engine it helps tor un and scale ml services in production ,0,0,"[0.7782416343688965, 0.17550846934318542, 0.046249911189079285]"
14 | different approaches for train test splitting of a pandas dataframe via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology progarmming news research coding aidevelopment via ,0,0,"[0.9966549277305603, 0.003133349819108844, 0.00021156204456929117]"
15 | artificial intelligence ai newsletter by towards ai via towardsai mw machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programmign news research coding aidevelopment ainews ainewsletter v ,1,1,"[0.24477313458919525, 0.6673772931098938, 0.08784963935613632]"
16 | the role of m lops on effective ai ,2,2,"[0.23168961703777313, 0.008777985349297523, 0.7595323324203491]"
17 | responsible ai widgets provides a collection of model and data exploration and asssesment user interfaces that enable a better understanding of ai systems mlops,2,0,"[0.6743612885475159, 0.031493671238422394, 0.29414504766464233]"
18 | mlops with kubernetes rabbtimq and fastapi ml machinelearning ai artificialintelligence,0,0,"[0.9960924983024597, 0.0036561377346515656, 0.00025132461450994015]"
19 | for ai to make a sizbale contribution to a company s bottom line organizations must scale the technology across the organization mlops can help but the ceo must facilitate it ,0,0,"[0.7841159105300903, 0.11956708133220673, 0.09631700813770294]"
20 | th estate of ai in machine learning in production mlops and data centric ai artificiallintelligence machinelearning data qacycle ,0,0,"[0.7806621789932251, 0.18903343379497528, 0.030304528772830963]"
21 | the state of ai in amchine learning in production mlops and data centric ai artificiallintelligence machinelearning data appsunify ,0,0,"[0.9957331418991089, 0.003992869984358549, 0.0002739278133958578]"
22 | communityday track about kubernetes deconstructed aws edition speaker ninad pundalik if you are mlops amp devops enthsuiasts do join the same acd awsusergroups awscommunityday ,0,0,"[0.7732478380203247, 0.17958824336528778, 0.04716384410858154]"
23 | next up on the judging panel for the mozdyaihackathon is angel rivera senior developer advocate at angel is an experienced hackathon mentor and judge and we re so excited to have him on our panel ai devlife coding devops mlops ,2,2,"[0.40167421102523804, 0.02482571266591549, 0.5734999775886536]"
24 | businesses in apac that invest in customerexperience are becoming pandemic proof covidburnout cx custexp custserv infinitejourneys rox retrust ex hcd hcxd designthinking servdes ai lifejourneys momentsoflife momentsthatmatter mlops,2,0,"[0.9945065975189209, 0.005119737703353167, 0.0003735064237844199]"
25 | communityday track about explainable ai with amazon sagemaker clarify by sarbani maiti if you are mlops enthusiasts do join the same acd awsusergroups aswcommunityday ,0,0,"[0.9942811727523804, 0.005316091235727072, 0.00040273607010021806]"
26 | what makes an optimal customerexperience in cx custserv custexp designthinking jtbd innovation ai mlop s devops purpose retrust infinitejourneys rox experienceequity,0,0,"[0.9874982833862305, 0.011554501950740814, 0.0009471528464928269]"
27 | like mlops kdiops takes a village,0,0,"[0.9969377517700195, 0.0028716595843434334, 0.00019061024067923427]"
28 | agile mindset needed in technology and business innovation strategy machinelearning datascience pytohn ai daysofcode iot flutter javascript serverless womenintech cybersecurity technology womenwhocode bigdata deeplearning data mlops rstats ,2,2,"[0.1650698184967041, 0.00620446540415287, 0.8287256956100464]"
29 | the rol eof mlops on effective ai ,2,2,"[0.15323346853256226, 0.004205780569463968, 0.8425607681274414]"
30 | for ai to make a sizable contribution to a company s bottom line organizations must scale the technology across the organization m lops can help but the ceo must facilitate it ,0,0,"[0.7621762752532959, 0.10468554496765137, 0.13313817977905273]"
31 | is there a way to compare these wit he g git dvc branches mlops modelops ,0,1,"[0.3838743567466736, 0.5366832613945007, 0.07944231480360031]"
32 | a copmlete mlops toolbox by martin carmona ,0,0,"[0.9958305954933167, 0.003902552416548133, 0.0002668892266228795]"
33 | datatron introduces new features to mlops and ai governance solution prnewswire ,0,0,"[0.945685863494873, 0.0491819903254509, 0.005131965968757868]"
34 | enusre machine learning success through mlops ,2,0,"[0.4987344443798065, 0.06378398090600967, 0.4374815821647644]"
35 | datatron introduces new features to mlops and ai governance solution prnewswrie ,0,0,"[0.9454514384269714, 0.04938902333378792, 0.005159459542483091]"
36 | i m be giving a talk at the conference only onew eek away get your tickets now towards cloud native distributed machine learning pipelines at scale machinelearning python datascience mlops devops cloudnative kubernetes,0,0,"[0.9922609329223633, 0.007194820325821638, 0.0005442930269055068]"
37 | prepare yourself for success with a strong foundation in machine learning essentials including mlops securing lm environments and training ml models at scale sign up for free today ,2,2,"[0.07517533004283905, 0.0008303517824970186, 0.923994243144989]"
38 | the latest update for algorithmia includes struggling with machinelearning you re not alone and report a comprehensive guide for machine learning governance in th eenterprise mlops ai analytics ,2,2,"[0.11064320057630539, 0.0021819151006639004, 0.8871749043464661]"
39 | big data analytics a viable solution to all healthcare problems via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news reesarch coding aidevelopment,0,0,"[0.9968715906143188, 0.002933291019871831, 0.00019528463599272072]"
40 | paper recommendation this paper explains why deep learning models such as bert clip gpt and dall earen t just new machine learning models but what they are now calling foundation models mlops datascientist machinelearning foundationmodels standfordai jai ,0,0,"[0.5046657919883728, 0.3528582751750946, 0.14247587323188782]"
41 | the rloe of mlops on effective ai by carl w handlin wallace ,2,2,"[0.3064640760421753, 0.013173254206776619, 0.6803627014160156]"
42 | build new skills in ml nlp mlops and much more whatever your skill level with hands on training sessions and expert led workshops at odscwest this november register now off ends soon datascience ai machinelearning nlp ,2,2,"[0.1051965057849884, 0.0018729001749306917, 0.8929306268692017]"
43 | odscwest will host some of the best and brightest minds in ml dl mlops and more don t miss this chance to learn from the leading experts in your field register now of fends soon ,2,2,"[0.07636086642742157, 0.0008551652426831424, 0.9227839708328247]"
44 | our upcoming training session on practical mlops will address some of the challenges and questoins that you might face while building out your organization s mlops datascoemce mlops ,0,0,"[0.9658889174461365, 0.031115038320422173, 0.0029959676321595907]"
45 | thrilled to kick off our product blog series highlighting the new features and enhancements in the verta platform first up the all important capability of managing access across different types of users and teams and supporting collaborative ai ml development mlop s verta ,2,2,"[0.16167517006397247, 0.005087140016257763, 0.8332377076148987]"
46 | key finding operations organizations that document and neforce mlops processes are twice as likely to achieve their goals to a high degree they are also nearly x more likely to be highly prepared for ai related risks,0,0,"[0.6923893094062805, 0.025036849081516266, 0.2825738787651062]"
47 | we re looking forward to participating in the fireside chat at xaisummit next week wednesday co founder will have a hcat with fellow panelists from and on mlops ecosystems free registration ,0,2,"[0.3861650824546814, 0.026446418836712837, 0.5873884558677673]"
48 | all this talk about mlops but what i really struggle wtih is kidops ,0,0,"[0.5316697359085083, 0.13710354268550873, 0.331226646900177]"
49 | reasons organizations must invest in data enginereing and mlops talents pcquest ,0,0,"[0.9966242909431458, 0.0031645207200199366, 0.00021117455617059022]"
50 | from insights gt gt see how is a key modelops vendor see why machinelearning bigdata ai enterpriseai datascience mlops modelopco modelgovernance modelriskmanagement datascientists aistrategy ,0,0,"[0.9948321580886841, 0.004829770885407925, 0.0003379612462595105]"
51 | different approachse for train test splitting of a pandas dataframe via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment,0,0,"[0.9968273639678955, 0.0029735269490629435, 0.00019914739823434502]"
52 | why do you need a feature store for machine learning learn this and more on our webcast on kubeflow feast watch to learn more mlops kubeflow featureengineering kbueflowfeast ,2,2,"[0.09164253622293472, 0.0013369751395657659, 0.9070204496383667]"
53 | a gentle introudction to mlops by yashaswi nayak in ,0,0,"[0.996032178401947, 0.0037136124446988106, 0.000254080950981006]"
54 | inusrance agents have to be very good at decision making in the insurance industry with the help of ai they can make the best decisions and provide enhanced customer service read this article to know more about it xpressoai datascientists mlops ,2,2,"[0.07920340448617935, 0.000931842252612114, 0.9198647141456604]"
55 | mlops with kubernetes rabbitmq and fastapi my new article was published on thanks skipper is a simple and flexible open source ml workflow engine it helps to run and scale ml services in production python kubernetes read ,0,0,"[0.6621905565261841, 0.2723737061023712, 0.06543572247028351]"
56 | datascientists and data engineers play a hgue role in mlops and devops with the right data both teams work closely to generate the best application performance head to the blog now to learn more via devops cloud programming aws ,2,2,"[0.09338055551052094, 0.001373759936541319, 0.9052456617355347]"
57 | read our full benchmark comparing mlops enterprise readiness soluitons in the cloud from analysts and jake dolezal machinelearning artificialintelligence deeplearning ai bigdata analytics datascience cloudcomputing mlops ,2,2,"[0.10897282510995865, 0.002131231129169464, 0.8888959884643555]"
58 | good overview and introduction to mlops for datascience by analytics iianalytics tech technology artificialintelligence machinelearning ml ai data dataanalytics d ataandanalytics,2,2,"[0.09891516715288162, 0.0016732927178964019, 0.8994114995002747]"
59 | check this summary of what s new in kubeflow plus a breakdown of contributor and chnage stats for each component machinelearning datascience mlops,0,0,"[0.9952695965766907, 0.004413694608956575, 0.0003167215909343213]"
60 | iguazio mlops platform now supports amazon fsx for nteapp ontap ,0,0,"[0.6057823300361633, 0.023763388395309448, 0.3704543113708496]"
61 | iguazio mlops platform nwo supports amazon fsx for netapp ontap ,0,0,"[0.9963659048080444, 0.003403782146051526, 0.0002302663924638182]"
62 | tools for machine learning serving in mlops tensorflow serving torch serve bentoml sagemaker cortex labs ployagon aible seldon lagorithmia,0,0,"[0.9960690140724182, 0.0036697378382086754, 0.0002611815871205181]"
63 | mlops is hot lots of interesting work happening in the startup ecosystem to help enterprises operationalize ml join us at xaisummit to listen to these amazing speakers from register today ,2,2,"[0.08134283870458603, 0.001001441851258278, 0.9176558256149292]"
64 | infographic types of machinelearning artificialintelligence deeplearning ai digitaltransformation bigdata analytisc datascience pytorch python tensorflow reactjs cloudcomputing datascientist linux daysofcode mlops modelops deeplearning ,0,0,"[0.9964445233345032, 0.003326390404254198, 0.00022906716912984848]"
65 | only weeks away from our mlopssalon we ll be bringing together expertsf rom industry as well as research and showcase best practices real world case studies and a wonderful panel discussion join us and register here mlops machinelearning,2,2,"[0.08088953793048859, 0.000979499309323728, 0.9181309342384338]"
66 | join this upcoming event to learn more about reproducibility mlops memoizatoin static checking and more register now odsc datascience ai ,2,2,"[0.07691574096679688, 0.0008849663427099586, 0.9221992492675781]"
67 | from faster model deployment and anomaly detection to adoption of real time data read how businesse suse mlops to improve management ,0,0,"[0.9930604100227356, 0.006471828557550907, 0.00046773048234172165]"
68 | are we heading towards a new wave of mlops tool evoultion i think so here is a small write up on our thought process mlops netbook mlinfraops datascience ,1,1,"[0.2561167776584625, 0.6674502491950989, 0.0764329805970192]"
69 | artificial intelligence ai newsletter by towards ai via towardsai mw machinelearning ml artifciialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment ainews ainewsletter,1,1,"[0.24498817324638367, 0.6660119891166687, 0.08899985998868942]"
70 | all you needt o know to start with deep learning via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment,0,0,"[0.9952159523963928, 0.004472442902624607, 0.00031160275102593005]"
71 | minikf is the fastest and easiest way to get kubeflow up and running on or your laptop got questions we have a new technical minikf faq that just went live machineleanring mlops datascience,0,0,"[0.993370771408081, 0.0061850701458752155, 0.0004441736964508891]"
72 | launches zero emission ai cloud with integratedm lops technology stack optimized for nvidia ein news ,0,0,"[0.5411422848701477, 0.3883601129055023, 0.07049757987260818]"
73 | streaming live at p edt is matt cowell from with our lunchtime keynote can humans learn like machines the case for human machine learning join his session free machinelearning executive augmentedmachinelearnnig mlops ,0,2,"[0.3963286578655243, 0.017366835847496986, 0.5863044857978821]"
74 | the imitation game can you tell the difference between people and ai deeplearning ml lmops aiops datascience,1,1,"[0.2585621774196625, 0.6317030787467957, 0.1097346767783165]"
75 | mlops with kubernetes rabbitmq and fastapi wewantdata data inisghts bigdata web database tech marketing ,0,0,"[0.6223801374435425, 0.30713409185409546, 0.07048574090003967]"
76 | the role of mlops on effective ai by carl w ahndlin wallace ,2,2,"[0.2497011423110962, 0.009928539395332336, 0.7403702735900879]"
77 | mlops iwth kubernetes rabbitmq and fastapi ,0,0,"[0.6326648592948914, 0.2954410910606384, 0.07189397513866425]"
78 | rt mlops with kubernetes rabbitmq and fastapi mlops imcroservices machinelearning python ,0,0,"[0.6604217886924744, 0.274705708026886, 0.06487248837947845]"
79 | big thanks for the super mlopsforgood swag was super fun working on this project together looking forwar dto the next one opensource mlops aiforgood ,0,2,"[0.26532408595085144, 0.012645184993743896, 0.722030758857727]"
80 | datatro nintroduces new features to mlops and ai governance solution ,0,0,"[0.995606005191803, 0.004109969828277826, 0.0002839597873389721]"
81 | neu ro launches zero emission ai cloud with integrated mlops technology stacko ptimized for nvidia architectures ,0,0,"[0.9966084957122803, 0.0031772786751389503, 0.0002141773875337094]"
82 | join today masterclass prat we examine the final leg of the journey to move the ai model into business modelops mlops aiethics aigovernance enterpriseai ,0,0,"[0.9854965209960938, 0.013322942890226841, 0.0011805054964497685]"
83 | hot off the press we ve released new research about the current state of machine learning in the enterprise download the erport to discover the latest industry trends you need to know mltrends enterpriseml mlops machinelearning,0,0,"[0.6281914114952087, 0.026905635371804237, 0.344902902841568]"
84 | october heartbeat is out all the news from our growing community mlops workflows lots of ways to learn meetup and conference videos docs udpates info on our growing team and more ,2,2,"[0.4681890904903412, 0.016620755195617676, 0.515190064907074]"
85 | found the ultimate project list for ml ai python nlp computervision deeplearning neuralnetworks machinelearning datascience datascinetist datamining mlops,0,0,"[0.995892345905304, 0.0038427524268627167, 0.0002650012611411512]"
86 | from sci fi films to reality artificiallintelligence has become one of the hottest fields in modern technology ho wexactly does ai benefit us and improve quality of life read more datascience machinelearning mlops nocode ,2,0,"[0.5610067248344421, 0.030900394544005394, 0.4080928564071655]"
87 | anindya has a great talk linked up fo r datascientists dataengineers and mlops folks tune in tomorrow and be sure to let me know what you think ,2,2,"[0.09707242250442505, 0.001532541704364121, 0.9013950824737549]"
88 | thinking darwinian via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopemnt,0,0,"[0.9967163801193237, 0.0030752187594771385, 0.00020837220654357225]"
89 | mlops and automl are two of the most popular applications of machine learning today giving teams the ability to automate tasks and bring devops principles to mcahine learning use cases ,2,2,"[0.09231641888618469, 0.0013705312740057707, 0.9063130617141724]"
90 | mlops and devops why data makes it different o reilly radar ,0,0,"[0.9952362775802612, 0.00445513566955924, 0.0003086017386522144]"
91 | seldon s fsi leda richard jarvis explores why bank omnichannel success needs mlops to truly scale in our latest blog post ,2,2,"[0.45805591344833374, 0.01759915053844452, 0.5243449211120605]"
92 | data changes over time resulting in predictive performance degradation in your models how can you address this issue often the ersult of concept drift see how to use these statistical methods to detect conceptdrift in your models mlops ,0,2,"[0.4532445967197418, 0.07632842659950256, 0.4704269468784332]"
93 | same i m also trying to do amp after learning programming mlops devops cloud full stack mobile app dev web dev etc now i feel the difference ,0,0,"[0.5526949167251587, 0.024858929216861725, 0.42244619131088257]"
94 | we re hosting our first virtual tech ethics meetup next friday nd october if you re interested in delving deeper into practical ai ethics from an mlops perspective join us find out moer details and sign up here ,0,0,"[0.628234326839447, 0.2226191908121109, 0.14914649724960327]"
95 | mlops and devops w hy data makes it different ,0,0,"[0.9954752326011658, 0.004232785198837519, 0.00029194532544352114]"
96 | on demand webinar watch fern halper from ankita gupta from sanjithraj rao from and lti s shivanand pawar discuss optimizing mlpos journey amp best practices for success in the recently concluded webinar letssolve ,2,2,"[0.23922623693943024, 0.007067013997584581, 0.753706693649292]"
97 | a fudnamental principle of neuroscience that is inspiring optimizations in neural networks via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelop ,2,0,"[0.4875865876674652, 0.040823642164468765, 0.47158968448638916]"
98 | how to generate th erequirements of your python project based on your imports via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via ,0,0,"[0.9957854151725769, 0.00394292501732707, 0.0002716170565690845]"
99 | rela time stock news sentiment analyzer via towardsai machinelearning ml artificialintelligence mlops ai datascience deeplearning technology programming news research coding aidevelopment via towardai,0,0,"[0.9955062866210938, 0.004199127200990915, 0.00029448879649862647]"
100 | what does your dat ascience workflow look like at askanna we talk with data scientists every week based on what we learned we created this datascience workflow what do you recognize what did we miss ml machinelearning ai mlops continuousdevelopment ,0,0,"[0.9949839115142822, 0.004690011031925678, 0.0003260923840571195]"
101 | streamline your computer vision stack with an end to end mlops platform via read more mlops machinelearning ml artificialintelligence ai deeplearning innovation ,2,2,"[0.10797715932130814, 0.001998339779675007, 0.8900244832038879]"
102 |
--------------------------------------------------------------------------------
/test_results/MFT_latest_test_results.csv:
--------------------------------------------------------------------------------
1 | negated_text,labels,predicted_labels,predicted_probabilities
2 | it is not sunny,0,0,"[0.9976467490196228, 0.0017173351952806115, 0.0006358569371514022]"
3 | pasta is not very delicious,1,2,"[0.3838845193386078, 0.02801734395325184, 0.5880982279777527]"
4 | the product is not worse,2,1,"[0.01968579739332199, 0.9801393747329712, 0.00017482005932834]"
5 | mlops is not inspired from devops,0,0,"[0.9993830919265747, 0.00034421923919580877, 0.0002726506209000945]"
6 | John is not a morning person,0,0,"[0.9989610910415649, 0.0006495718262158334, 0.0003893142275046557]"
7 |
--------------------------------------------------------------------------------
/test_results/MFT_production_test_results.csv:
--------------------------------------------------------------------------------
1 | negated_text,labels,predicted_labels,predicted_probabilities
2 | it is not sunny,0,0,"[0.9916446805000305, 0.007791681680828333, 0.0005636655259877443]"
3 | pasta is not very delicious,1,1,"[0.242270827293396, 0.6703853011131287, 0.08734394609928131]"
4 | the product is not worse,2,0,"[0.43473562598228455, 0.24204568564891815, 0.3232187330722809]"
5 | mlops is not inspired from devops,0,0,"[0.9960785508155823, 0.003672090359032154, 0.00024939357535913587]"
6 | John is not a morning person,0,0,"[0.9937883019447327, 0.005802359897643328, 0.00040929310489445925]"
7 |
--------------------------------------------------------------------------------
/utils/experiment_tracking.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Jithin Sasikumar
3 |
4 | Module to track model training and log the model artifacts, resulting metrics
5 | and parameters. For that purpose, `MLFlow` is used. This module has the flexibility
6 | to extend its functionality and support other tracking mechanisms such as TensorBoard.
7 | It is facilitated via the `ExperimentTracker` protocol, which acts as an interface.
8 | """
9 |
10 | import mlflow
11 | from typing import Protocol
12 | from dataclasses import dataclass
13 |
14 | class ExperimentTracker(Protocol):
15 | """
16 |     Interface to track experiments by inheriting from the Protocol class.
17 | """
18 | def __start__(self):
19 | ...
20 |
21 | def log(self):
22 | ...
23 |
24 | def end(self):
25 | ...
26 |
27 | @dataclass
28 | class MLFlowTracker:
29 | """
30 | Dataclass to track experiment via MLFlow.
31 |
32 | Instance variables
33 | ------------------
34 | experiment_name: str
35 | Name of the experiment to be activated or created.
36 | tracking_uri: str
37 | URI of EC2 instance where MLflow server is hosted.
38 | run_name: str
39 | Name of training run pertaining to an experiment.
40 | experiment: bool
41 |         True to create a new experiment, False to reuse an existing one.
42 | """
43 |
44 | experiment_name: str
45 | tracking_uri: str
46 | run_name: str
47 | experiment: bool
48 |
49 | def __start__(self) -> None:
50 | """
51 |         Dunder method to start a new MLflow run on the tracking server, set the
52 |         tracking URI and create or activate the experiment.
53 |
54 | Parameters
55 | ----------
56 | None
57 |
58 | Returns
59 | -------
60 | None
61 |
62 | Raises
63 | ------
64 | ConnectionError: Exception
65 |             If the MLflow tracking URI doesn't exist or is invalid.
66 | """
67 | try:
68 | mlflow.set_tracking_uri(self.tracking_uri)
69 |
70 | except ConnectionError:
71 | print(f"Cannot connect to {self.tracking_uri}. Please check and validate the URI!!")
72 |
73 | else:
74 | if self.experiment:
75 | exp_id = mlflow.create_experiment(self.experiment_name)
76 | experiment = mlflow.get_experiment(exp_id)
77 |
78 | else:
79 | experiment = mlflow.set_experiment(self.experiment_name)
80 |
81 | mlflow.start_run(run_name = self.run_name,
82 | experiment_id = experiment.experiment_id)
83 |
84 | def log(self) -> None:
85 | """
86 |         Initialize auto-logging for tracking. This will log the model artifacts
87 |         to the S3 bucket, and the parameters and metrics to the MLflow server on the EC2 instance.
88 | """
89 | self.__start__()
90 | mlflow.tensorflow.autolog()
91 |
92 | def end(self) -> None:
93 | """
94 | End an active MLflow run.
95 | """
96 | mlflow.end_run()
--------------------------------------------------------------------------------
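Note: a minimal usage sketch for `MLFlowTracker`; the tracking URI, experiment name and run name below are illustrative placeholders and are not taken from the repo's configuration.

from utils.experiment_tracking import MLFlowTracker

# All values below are assumed placeholders, not repo configuration.
tracker = MLFlowTracker(
    experiment_name="twitter-sentiment",
    tracking_uri="http://<mlflow-ec2-host>:5000",
    run_name="bilstm-baseline",
    experiment=False,          # reuse an existing experiment instead of creating one
)
tracker.log()                  # starts the run and enables mlflow.tensorflow.autolog()
# ... train the Keras model here; artifacts, params and metrics are logged automatically ...
tracker.end()                  # ends the active MLflow run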
/utils/helper.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Jithin Sasikumar
3 |
4 | Module consisting of helper functions which are generic across the project.
5 | """
6 |
7 | import re
8 | import os
9 | import nltk
10 | import pandas as pd
11 | from textblob import TextBlob
12 | from nltk.probability import FreqDist
13 | import tomli as tomlib
14 | from typing import Any
15 | from dataclasses import dataclass
16 | from airflow import settings
17 | from airflow.exceptions import AirflowFailException
18 | from airflow.models.connection import Connection
19 |
20 | class Config:
21 | """
22 | Loads all configurations from `config.toml` for the project.
23 | """
24 | def __new__(cls) -> dict[str, Any]:
25 | """
26 | Dunder method to load config.
27 |
28 | Parameters
29 | ----------
30 | cls
31 | Class to be instantiated.
32 |
33 | Returns
34 | -------
35 | config: dict[str, Any]
36 | Loaded configurations as dict.
37 | """
38 |
39 | with open("./config/config.toml", mode="rb") as config_file:
40 | config = tomlib.load(config_file)
41 | return config
42 |
43 | def load_dataframe(file_path: str) -> pd.DataFrame:
44 | """
45 | Helper function to load any parquet file as pandas dataframe.
46 |
47 | Parameters
48 | ----------
49 | file_path: str
50 | Path to input parquet file.
51 |
52 | Returns
53 | -------
54 | dataframe: pd.DataFrame
55 | """
56 | this_dir = os.getcwd()
57 | dataframe_path = os.path.join(this_dir, file_path)
58 | dataframe = pd.read_parquet(path = dataframe_path, engine = "pyarrow")
59 | return dataframe
60 |
61 | @dataclass
62 | class Connections:
63 | """
64 | Dataclass to configure and set Airflow connections.
65 | """
66 | new_connection: Connection
67 |
68 | def create_connections(self) -> bool:
69 | """
70 |         Method to create a new Airflow connection.
71 |
72 | Parameters
73 | ----------
74 | None
75 |
76 | Returns
77 | -------
78 | bool
79 | True if connection is created, else False.
80 |
81 | Raises
82 | ------
83 | AirflowFailException: Exception
84 | If connection cannot be created or invalid.
85 | """
86 | try:
87 | session = settings.Session()
88 | connection_name = session.query(Connection).filter(
89 | Connection.conn_id == self.new_connection.conn_id
90 | ).first()
91 |
92 | if str(connection_name) != str(self.new_connection.conn_id):
93 | session.add(self.new_connection)
94 | session.commit()
95 |
96 | except Exception as exc:
97 | raise AirflowFailException( f"Error when creating new connection:{exc}") from exc
98 |
99 | else:
100 | return True
101 |
102 | finally:
103 | session.close()
104 |
105 | def remove_noise(text: str) -> str:
106 | """
107 | Helper function to remove noise from text as part of text cleaning
108 | using regular expressions (regex).
109 |
110 | Parameters
111 | ----------
112 | text: str
113 | Input text
114 |
115 | Returns
116 | -------
117 | Cleaned text
118 | """
119 |
120 |     text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
121 |                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
122 |     text = re.sub(r"(@[A-Za-z0-9_]+)", "", text)
123 |     text = re.sub('\n', ' ', text)
124 |     text = re.sub('#', '', text)
125 |
126 | return text
127 |
128 | def calculate_polarity(text: str) -> float:
129 | """
130 | Helper function to calculate text polarity.
131 |
132 | Parameters
133 | ----------
134 | text: str
135 | Input text
136 |
137 | Returns
138 | -------
139 | polarity: float
140 | """
141 | return TextBlob(text).sentiment.polarity
142 |
143 | def remove_stopwords(tokens: list[str],
144 | stopwords_: nltk.corpus.stopwords) -> list[str]:
145 | """
146 | Helper function to remove stopwords from given input tokens.
147 |
148 | Parameters
149 | ----------
150 | tokens: list[str]
151 | List of tokens pertaining to each text.
152 | stopwords_: nltk.corpus.stopwords
153 | List of stopwords defined in NLTK.
154 |
155 | Returns
156 | -------
157 | list[str]
158 | Resultant list of text with no stopwords.
159 | """
160 | return [token for token in tokens if token not in stopwords_]
161 |
162 | def remove_less_frequent_words(dataframe) -> pd.DataFrame:
163 | """
164 |     Helper function to remove the words that occur two times or fewer in the corpus.
165 |
166 | Parameters
167 | ----------
168 | dataframe: pd.DataFrame
169 | Input dataframe
170 |
171 | Returns
172 | -------
173 | Resultant dataframe with less frequent words removed.
174 | """
175 |
176 | dataframe['tokenized_strings'] = dataframe['tokenized_tweets'].apply(
177 | lambda tokens: ' '.join(
178 | [token for token in tokens if len(token) > 2]
179 | )
180 | )
181 | tokenized_words = nltk.tokenize.word_tokenize(' '.join(
182 | [word
183 | for word in dataframe['tokenized_strings']
184 | ]
185 | )
186 | )
187 | frequency_distribution = FreqDist(tokenized_words)
188 | dataframe['tokenized_strings'] = dataframe['tokenized_tweets'].apply(
189 | lambda tweets: ' '.join(
190 | [tweet for tweet in tweets
191 | if frequency_distribution[tweet] > 2
192 | ]
193 | )
194 | )
195 | return dataframe
196 |
197 | def assign_sentiment_labels(score: float) -> str:
198 | """
199 | Helper function to assign sentiment labels to polarity scores.
200 |
201 | Parameters
202 | ----------
203 | score: float
204 | Polarity score of each text.
205 |
206 | Returns
207 | -------
208 | sentiment_label: str
209 | """
210 | if score > 0.25:
211 | return "positive"
212 | elif score < 0:
213 | return "negative"
214 | else:
215 | return "neutral"
--------------------------------------------------------------------------------
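Note: a minimal sketch of chaining the text-cleaning helpers above on a single tweet; the input text and the use of English NLTK stopwords are assumptions made for illustration.

import nltk
from nltk.corpus import stopwords
from utils.helper import (remove_noise, remove_stopwords,
                          calculate_polarity, assign_sentiment_labels)

nltk.download("stopwords")
nltk.download("punkt")

raw_tweet = "Loving the new #MLOps setup! https://example.com @someuser"   # made-up example
cleaned = remove_noise(raw_tweet)                        # strips URLs, mentions, '#' and newlines
tokens = nltk.tokenize.word_tokenize(cleaned.lower())
tokens = remove_stopwords(tokens, stopwords.words("english"))
label = assign_sentiment_labels(calculate_polarity(" ".join(tokens)))
print(label)                                             # "positive", "neutral" or "negative"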
/utils/model.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Jithin Sasikumar
3 |
4 | Module to define the deep neural network used for training.
5 |
6 | A bi-directional LSTM (biLSTM) network is used for this project, encompassing an
7 | embedding layer and a stack of biLSTM layers followed by fully connected dense layers
8 | with dropout. This module provides the flexibility to add other models
9 | by inheriting from Models(ABC).
10 |
11 | """
12 |
13 | from abc import ABC, abstractmethod
14 | from dataclasses import dataclass
15 | from keras.models import Sequential
16 | from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
17 |
18 | class Models(ABC):
19 | """
20 | Abstract base class that defines and creates model.
21 | """
22 | @abstractmethod
23 | def define_model(self):
24 | pass
25 |
26 | @abstractmethod
27 | def create_model(self):
28 | pass
29 |
30 |
31 | @dataclass
32 | class BiLSTM_Model(Models):
33 | """
34 | Dataclass to create biLSTM model inheriting Models class.
35 | """
36 | vocab_size: int
37 | num_classes: int
38 | embedding_dim: int = 64
39 | input_length: int = 128
40 |
41 | def define_model(self) -> Sequential:
42 | """
43 | Method to define model that can be used for training and inference.
44 | The existing model can also be tweaked by changing parameters,
45 | based on the requirements.
46 |
47 | Parameters
48 | ----------
49 | None
50 |
51 | Returns
52 | -------
53 | keras.models.Sequential
54 | """
55 | return Sequential(
56 | [
57 |
58 | # Embedding layer that expects the following:
59 | # Size of vocabulary, Output embedding vectors & Size of each input sequence
60 | Embedding(self.vocab_size, self.embedding_dim, input_length = self.input_length),
61 |
62 | #Bidirectional LSTM layers
63 | Bidirectional(LSTM(self.embedding_dim, return_sequences=True)),
64 | Bidirectional(LSTM(64, return_sequences = True)),
65 | Bidirectional(LSTM(32)),
66 |
67 | #Dense layers
68 | Dense(self.embedding_dim, activation = 'relu'),
69 | Dense(64, activation = 'relu'),
70 | Dropout(0.25),
71 | Dense(self.num_classes, activation = 'softmax')
72 | ]
73 | )
74 |
75 | def create_model(self) -> Sequential:
76 | """
77 |         Method to create the model defined by define_model()
78 |         and print the model summary.
79 |
80 | Parameters
81 | ----------
82 | None
83 |
84 | Returns
85 | -------
86 | model: keras.models.Sequential
87 | Created model
88 | """
89 |
90 | model: Sequential = self.define_model()
91 | model.summary()
92 | return model
--------------------------------------------------------------------------------
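Note: a minimal sketch of instantiating and compiling BiLSTM_Model; the vocabulary size, optimizer and loss below are assumptions made for illustration and are not taken from the repo's training code.

from utils.model import BiLSTM_Model

# 30522 is the bert-base-uncased vocabulary size (assumed to match the BERT tokenizer in prepare_data.py).
model = BiLSTM_Model(vocab_size=30522, num_classes=3).create_model()
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",   # assumes integer class labels (0, 1, 2)
              metrics=["accuracy"])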
/utils/prepare_data.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Jithin Sasikumar
3 |
4 | Module to transform a preprocessed dataframe (parquet or csv) into tf.data.Dataset format,
5 | which creates an efficient input pipeline that can in turn be fed into the TensorFlow model.
6 | The BERT tokenizer is used instead of a plain tokenizer for better embeddings.
7 |
8 | """
9 | import math
10 | import pandas as pd
11 | import numpy as np
12 | import tensorflow as tf
13 | from dataclasses import dataclass, field
14 | from transformers import BertTokenizer
15 |
16 | @dataclass
17 | class Dataset:
18 | """
19 | Dataclass that encodes and transforms dataframe into tensorflow dataset.
20 | """
21 | tokenizer: BertTokenizer
22 |     dataframe: pd.DataFrame = field(default_factory = pd.DataFrame)  # default_factory expects a callable, not an instance
23 | labels: np.ndarray = None
24 | batch_size: int = 64
25 | max_length: int = 256
26 | train: bool = False
27 | col_name: str = "cleaned_tweets"
28 |
29 | @property
30 | def list_of_texts(self) -> list[str]:
31 | """
32 | Class property to convert text column of dataframe to list of strings
33 | for processing.
34 |
35 | Parameters
36 | ----------
37 | None
38 |
39 | Returns
40 | -------
41 | list[str]
42 | List of texts
43 | """
44 | return self.dataframe[self.col_name].tolist()
45 |
46 | @property
47 | def shuffle_size(self) -> int:
48 | """
49 | Class property to calculate the shuffle size for dataset.
50 |
51 | Parameters
52 | ----------
53 | None
54 |
55 | Returns
56 | -------
57 | shuffle_size: int
58 | """
59 | return math.ceil(len(self.list_of_texts) / self.batch_size)
60 |
61 |     def encode_bert_tokens_to_tf_dataset(self) -> tf.data.Dataset:
62 | """
63 | Transform tokens into tensorflow dataset. The dataset is batched and
64 | shuffled.
65 |
66 |         The BERT tokenizer is used, i.e. the texts are tokenized and each token
67 |         is encoded into a unique ID (referred to as input_ids) by means of the vocabulary.
68 |
69 | Parameters
70 | ----------
71 | None
72 |
73 | Returns
74 | -------
75 |         dataset: tf.data.Dataset
76 | Tensorflow dataset after batching and shuffling.
77 | """
78 |         tokenized = self.tokenizer(
79 | text = self.list_of_texts,
80 | add_special_tokens = True,
81 | max_length = self.max_length,
82 | padding = "max_length",
83 | return_tensors = "tf",
84 | return_attention_mask = False,
85 | return_token_type_ids = False,
86 | verbose = True
87 | )
88 |
89 | input_ids = tf.data.Dataset.from_tensor_slices(np.array(tokenized['input_ids']))
90 | labels = tf.data.Dataset.from_tensor_slices(self.labels)
91 | # Zipping input_ids and labels as a single dataset object
92 | dataset = tf.data.Dataset.zip((input_ids, labels))
93 |
94 | if self.train:
95 | return dataset.shuffle(self.shuffle_size).batch(self.batch_size)
96 |
97 | return dataset.batch(self.batch_size)
--------------------------------------------------------------------------------
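Note: a minimal sketch of turning a cleaned dataframe into a training tf.data.Dataset with the Dataset class above; the parquet path, the "labels" column name and the presence of a "cleaned_tweets" column are assumptions made for illustration.

import numpy as np
from transformers import BertTokenizer
from utils.helper import load_dataframe
from utils.prepare_data import Dataset

df = load_dataframe("scripts/test_data/test_data.parquet")      # assumed to contain "cleaned_tweets" and "labels"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_dataset = Dataset(tokenizer=tokenizer,
                        dataframe=df,
                        labels=np.asarray(df["labels"]),        # assumed label column name
                        train=True).encode_bert_tokens_to_tf_dataset()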