├── Data └── spam.csv ├── Notebooks └── spam-ham-classifier-sentence-bert-pos.ipynb ├── UI ├── Dockerfile ├── app.py └── requirements.txt ├── docker-compose.yml ├── images └── spam.png ├── readme.md ├── requirements.txt └── service ├── Dockerfile ├── main.py └── requirements.txt /Data/spam.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/subhasisj/FastAPI-Streamlit-Docker-NLP/cd7706d3d431a09e3f96cd9f22aeadda2d3b2b83/Data/spam.csv -------------------------------------------------------------------------------- /Notebooks/spam-ham-classifier-sentence-bert-pos.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3.6.9-final" 14 | }, 15 | "orig_nbformat": 2, 16 | "kernelspec": { 17 | "name": "python3", 18 | "display_name": "Python 3", 19 | "language": "python" 20 | } 21 | }, 22 | "nbformat": 4, 23 | "nbformat_minor": 2, 24 | "cells": [ 25 | { 26 | "cell_type": "code", 27 | "execution_count": 98, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd \n", 32 | "import numpy as np \n", 33 | "import spacy \n", 34 | "from sentence_transformers import SentenceTransformer\n", 35 | "from tqdm import tqdm\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 99, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "output_type": "execute_result", 45 | "data": { 46 | "text/plain": [ 47 | " v1 v2 Unnamed: 2 \\\n", 48 | "0 ham Go until jurong point, crazy.. Available only ... NaN \n", 49 | "1 ham Ok lar... Joking wif u oni... NaN \n", 50 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n", 51 | "3 ham U dun say so early hor... U c already then say... NaN \n", 52 | "4 ham Nah I don't think he goes to usf, he lives aro... NaN \n", 53 | "\n", 54 | " Unnamed: 3 Unnamed: 4 \n", 55 | "0 NaN NaN \n", 56 | "1 NaN NaN \n", 57 | "2 NaN NaN \n", 58 | "3 NaN NaN \n", 59 | "4 NaN NaN " 60 | ], 61 | "text/html": "
" 62 | }, 63 | "metadata": {}, 64 | "execution_count": 99 65 | } 66 | ], 67 | "source": [ 68 | "df = pd.read_csv('../Data/spam.csv',encoding='latin-1')\n", 69 | "df.head()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 100, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df = df.iloc[:,0:2]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 101, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "output_type": "execute_result", 88 | "data": { 89 | "text/plain": [ 90 | " label text\n", 91 | "0 ham Go until jurong point, crazy.. Available only ...\n", 92 | "1 ham Ok lar... Joking wif u oni...\n", 93 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", 94 | "3 ham U dun say so early hor... U c already then say...\n", 95 | "4 ham Nah I don't think he goes to usf, he lives aro..." 96 | ], 97 | "text/html": "
" 98 | }, 99 | "metadata": {}, 100 | "execution_count": 101 101 | } 102 | ], 103 | "source": [ 104 | "df.columns = ['label','text']\n", 105 | "df.head()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 102, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "output_type": "stream", 115 | "name": "stderr", 116 | "text": [ 117 | "Processing with spaCy: 100%|██████████| 5572/5572 [00:29<00:00, 186.72it/s]\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "nlp = spacy.load('en_core_web_sm')\n", 123 | "tqdm.pandas(desc='Processing with spaCy')\n", 124 | "spacy_results = df['text'].progress_map(nlp)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 103, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# Encode with Sentence Transformers\n", 134 | "\n", 135 | "sentence_bert = SentenceTransformer('paraphrase-distilroberta-base-v1')\n", 136 | "# tqdm.pandas(desc='Applying sentence-bert')\n", 137 | "# vectors = df['text'].progress_map(model.encode)\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 104, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "output_type": "display_data", 147 | "data": { 148 | "text/plain": "Pandas Apply: 0%| | 0/5572 [00:00\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
\n" 232 | }, 233 | "metadata": {}, 234 | "execution_count": 106 235 | } 236 | ], 237 | "source": [ 238 | "df['sentence-bert'] = vectors_swifter\n", 239 | "df.head()" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 107, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "output_type": "display_data", 249 | "data": { 250 | "text/plain": "Pandas Apply: 0%| | 0/5572 [00:00\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
\n" 292 | }, 293 | "metadata": {}, 294 | "execution_count": 107 295 | } 296 | ], 297 | "source": [ 298 | "df['label'] = df.label.swifter.apply(lambda x : 1 if x =='spam' else 0)\n", 299 | "df.head()" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 108, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "df.drop(columns=['raw_spacy'],inplace=True)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 171, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 318 | "from sklearn.compose import ColumnTransformer\n", 319 | "from sklearn.preprocessing import FunctionTransformer\n", 320 | "from sklearn.model_selection import train_test_split\n", 321 | "from sklearn import metrics\n", 322 | "from sklearn.pipeline import Pipeline\n", 323 | "from sklearn.linear_model import LogisticRegression\n", 324 | "from xgboost import XGBClassifier\n", 325 | "\n" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 289, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "train_df = df.copy()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 290, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "def stack_embeddings(embeddings):\n", 344 | " import numpy as np\n", 345 | " return np.vstack(embeddings.values)\n", 346 | "\n", 347 | "ct = ColumnTransformer([\n", 348 | " ('bag of ngrams', TfidfVectorizer(ngram_range=(1, 2), max_features=3000), 'text'),\n", 349 | " ('bag of POS', CountVectorizer(ngram_range=(1, 2)), 'raw_pos'),\n", 350 | " # Lambda functions cannot be pickled\n", 351 | " ('sentence bert', FunctionTransformer(stack_embeddings), 'sentence-bert'),\n", 352 | " # ('bag of NER types', CountVectorizer(ngram_range=(1, 2)), 'raw_ner'),\n", 353 | " # ('ngrams before', TfidfVectorizer(ngram_range=(1, 2), max_features=3000), 'raw_before'),\n", 354 | " # ('ngrams after', TfidfVectorizer(ngram_range=(1, 2), max_features=3000), 'raw_after') \n", 355 | "],remainder='passthrough')\n", 356 | "\n", 357 | "# lm = LogisticRegression()\n", 358 | "xgb = XGBClassifier(random_state=0)\n", 359 | "\n", 360 | "# pipeline = Pipeline([('transformer', ct), ('classifier', lm)])\n", 361 | "pipeline = Pipeline([('transformer', ct), ('classifier', xgb)])\n", 362 | "\n", 363 | "\n", 364 | "y,X = train_df.pop('label'),train_df\n", 365 | "\n", 366 | "X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42,stratify=y)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 291, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "output_type": "stream", 376 | "name": "stdout", 377 | "text": [ 378 | "[15:13:14] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior.\n", 379 | "CPU times: user 53.3 s, sys: 354 ms, total: 53.7 s\n", 380 | "Wall time: 5.52 s\n" 381 | ] 382 | } 383 | ], 384 | "source": [ 385 | "\n", 386 | "\n", 387 | "%time model = pipeline.fit(X_train, y_train)" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 292, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "y_pred = model.predict(X_test)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 293, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "output_type": "stream", 406 | "name": "stdout", 407 | "text": [ 408 | " precision recall f1-score support\n\n 0 0.99 1.00 0.99 1448\n 1 0.99 0.91 0.95 224\n\n accuracy 0.99 1672\n macro avg 0.99 0.95 0.97 1672\nweighted avg 0.99 0.99 0.99 1672\n\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "print(metrics.classification_report(y_test,y_pred))" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 294, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "# import joblib\n", 423 | "# filename = 'model.sav'\n", 424 | "# joblib.dump(model, filename)\n", 425 | "import dill\n", 426 | "\n", 427 | "\n", 428 | "pkl_filename = \"../Models/model.pkl\"\n", 429 | "with open(pkl_filename, 'wb') as file:\n", 430 | " dill.dump(model, file)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 295, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "output_type": "stream", 440 | "name": "stdout", 441 | "text": [ 442 | "\u001b[0m\u001b[01;32mmodel.pkl\u001b[0m*\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "ls ../Models" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 296, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "with open(pkl_filename,'rb') as file:\n", 457 | " loaded_model = dill.load(file)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 297, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "def make_inference_df(input_text):\n", 467 | "\n", 468 | " model_input_dict = {}\n", 469 | " input_row_list = []\n", 470 | " \n", 471 | "\n", 472 | " spacy_raw = nlp(input_text)\n", 473 | " # pos_tags = [t.pos_ for t in spacy_raw]\n", 474 | "\n", 475 | " model_input_dict['text'] = input_text\n", 476 | " model_input_dict['raw_pos'] = ' '.join([t.pos_ for t in spacy_raw])\n", 477 | " model_input_dict['sentence-bert'] = sentence_bert.encode(input_text)\n", 478 | "\n", 479 | " input_row_list.append(model_input_dict)\n", 480 | "\n", 481 | " model_input_df = pd.DataFrame(input_row_list)\n", 482 | " return model_input_df" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 298, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "output_type": "execute_result", 492 | "data": { 493 | "text/plain": [ 494 | "array([1])" 495 | ] 496 | }, 497 | "metadata": {}, 498 | "execution_count": 298 499 | } 500 | ], 501 | "source": [ 502 | "sample_text = 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. 
Text FA to 87121 to receive entry question(std txt rate)T&C\\'s'\n", 503 | "# make_inference_df(sample_text)\n", 504 | "loaded_model.predict(make_inference_df(sample_text))" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 299, 510 | "metadata": {}, 511 | "outputs": [ 512 | { 513 | "output_type": "stream", 514 | "name": "stdout", 515 | "text": [ 516 | "Nah I don't think he goes to usf, he lives around here though\n" 517 | ] 518 | }, 519 | { 520 | "output_type": "execute_result", 521 | "data": { 522 | "text/plain": [ 523 | "array([1])" 524 | ] 525 | }, 526 | "metadata": {}, 527 | "execution_count": 299 528 | } 529 | ], 530 | "source": [ 531 | "sample_text_2 = 'Nah I don\\'t think he goes to usf, he lives around here though'\n", 532 | "print(sample_text_2)\n", 533 | "loaded_model.predict(make_inference_df(sample_text))" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [] 556 | } 557 | ] 558 | } -------------------------------------------------------------------------------- /UI/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt . 6 | RUN pip install -r requirements.txt 7 | 8 | # COPY . . 9 | 10 | EXPOSE 8501 11 | 12 | CMD ["streamlit", "run", "app.py"] -------------------------------------------------------------------------------- /UI/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import requests 3 | 4 | 5 | def main(): 6 | 7 | st.title("Spam Classification") 8 | message = st.text_input('Enter Text to Classify') 9 | 10 | if st.button('Predict'): 11 | payload = { 12 | "text": message 13 | } 14 | res = requests.post(f"http://service:8000/predict/",json=payload ) 15 | with st.spinner('Classifying, please wait....'): 16 | st.write(res.json()) 17 | 18 | 19 | 20 | 21 | if __name__ == '__main__': 22 | main() -------------------------------------------------------------------------------- /UI/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.7.3 2 | aiohttp-cors==0.7.0 3 | aioredis==1.3.1 4 | altair==4.1.0 5 | argon2-cffi==20.1.0 6 | astor==0.8.1 7 | async-generator==1.10 8 | async-timeout==3.0.1 9 | attrs==20.3.0 10 | backcall==0.2.0 11 | base58==2.1.0 12 | bleach==3.3.0 13 | blessings==1.7 14 | blinker==1.4 15 | blis==0.7.4 16 | cachetools==4.2.1 17 | catalogue==2.0.1 18 | certifi==2020.12.5 19 | cffi==1.14.4 20 | chardet==3.0.4 21 | click==7.1.2 22 | colorama==0.4.4 23 | colorful==0.5.4 24 | contextvars==2.4 25 | cymem==2.0.5 26 | # dask==2021.2.0 27 | # dataclasses==0.8 28 | decorator==4.4.2 29 | defusedxml==0.6.0 30 | dill==0.3.3 31 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl 32 | entrypoints==0.3 33 | fastapi==0.63.0 34 | filelock==3.0.12 35 | fsspec==0.8.5 36 | gitdb==4.0.5 37 | GitPython==3.1.13 38 | google-api-core==1.26.0 39 | google-auth==1.26.0 40 | googleapis-common-protos==1.52.0 41 | gpustat==0.6.0 42 | grpcio==1.35.0 43 | h11==0.12.0 44 | hiredis==1.1.0 
45 | idna==2.10 46 | idna-ssl==1.1.0 47 | immutables==0.15 48 | importlib-metadata==3.4.0 49 | ipykernel==5.4.3 50 | ipython==7.16.1 51 | ipython-genutils==0.2.0 52 | ipywidgets==7.6.3 53 | jedi==0.18.0 54 | Jinja2==2.11.3 55 | joblib==1.0.1 56 | jsonschema==3.2.0 57 | jupyter-client==6.1.11 58 | jupyter-core==4.7.1 59 | jupyterlab-pygments==0.1.2 60 | jupyterlab-widgets==1.0.0 61 | locket==0.2.1 62 | MarkupSafe==1.1.1 63 | mistune==0.8.4 64 | modin==0.8.3 65 | msgpack==1.0.2 66 | multidict==5.1.0 67 | murmurhash==1.0.5 68 | nbclient==0.5.2 69 | nbconvert==6.0.7 70 | nbformat==5.1.2 71 | nest-asyncio==1.5.1 72 | nltk==3.5 73 | notebook==6.2.0 74 | numpy==1.19.5 75 | nvidia-ml-py3==7.352.0 76 | opencensus==0.7.12 77 | opencensus-context==0.1.2 78 | packaging==20.9 79 | pandarallel==1.5.2 80 | pandas==1.1.5 81 | pandocfilters==1.4.3 82 | parso==0.8.1 83 | partd==1.1.0 84 | pathy==0.3.6 85 | pexpect==4.8.0 86 | pickleshare==0.7.5 87 | Pillow==8.1.0 88 | preshed==3.0.5 89 | prometheus-client==0.9.0 90 | prompt-toolkit==3.0.15 91 | protobuf==3.14.0 92 | psutil==5.8.0 93 | ptyprocess==0.7.0 94 | py-spy==0.3.4 95 | pyarrow==1.0.0 96 | pyasn1==0.4.8 97 | pyasn1-modules==0.2.8 98 | pycparser==2.20 99 | pydantic==1.7.3 100 | pydeck==0.6.0 101 | Pygments==2.7.4 102 | pyparsing==2.4.7 103 | pyrsistent==0.17.3 104 | python-dateutil==2.8.1 105 | pytz==2021.1 106 | PyYAML==5.4.1 107 | pyzmq==22.0.2 108 | ray==1.1.0 109 | redis==3.5.3 110 | regex==2020.11.13 111 | requests==2.25.1 112 | rsa==4.7 113 | sacremoses==0.0.43 114 | scikit-learn==0.24.1 115 | scipy==1.5.4 116 | Send2Trash==1.5.0 117 | sentence-transformers==0.4.1.2 118 | sentencepiece==0.1.95 119 | six==1.15.0 120 | smart-open==3.0.0 121 | smmap==3.0.5 122 | spacy==3.0.1 123 | spacy-legacy==3.0.1 124 | srsly==2.4.0 125 | starlette==0.13.6 126 | streamlit==0.76.0 127 | # swifter==1.0.7 128 | terminado==0.9.2 129 | testpath==0.4.4 130 | thinc==8.0.1 131 | threadpoolctl==2.1.0 132 | tokenizers==0.10.1 133 | toml==0.10.2 134 | toolz==0.11.1 135 | torch==1.7.1 136 | tornado==6.1 137 | tqdm==4.56.2 138 | traitlets==4.3.3 139 | transformers==4.3.2 140 | typer==0.3.2 141 | typing-extensions==3.7.4.3 142 | tzlocal==2.1 143 | urllib3==1.26.3 144 | uvicorn==0.13.3 145 | validators==0.18.2 146 | wasabi==0.8.2 147 | watchdog==2.0.0 148 | wcwidth==0.2.5 149 | webencodings==0.5.1 150 | widgetsnbextension==3.5.1 151 | xgboost==1.3.3 152 | yarl==1.6.3 153 | zipp==3.4.0 154 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | frontendui: 5 | build: UI 6 | ports: 7 | - 8501:8501 8 | volumes: 9 | - './UI:/app:delegated' 10 | depends_on: 11 | - service 12 | 13 | service: 14 | build: service 15 | ports: 16 | - 8000:8000 -------------------------------------------------------------------------------- /images/spam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/subhasisj/FastAPI-Streamlit-Docker-NLP/cd7706d3d431a09e3f96cd9f22aeadda2d3b2b83/images/spam.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Text-Classification with FastAPI / Streamlit / Docker Compose 2 | 3 | This project showcases the use of `FastAPI` and `Streamlit` in tandem. 
4 | The trained model is deployed using a FastAPI rest service containerized using `Docker`. 5 | The front end UI is built on Streamlit which is hosted on its own Docker container. 6 | 7 | We spin both the containers together using `Docker Compose` . 8 | 9 | 10 | ![GitHub Logo](/images/spam.png) 11 | 12 | 13 | ## How to use 14 | 15 | Clone this repo and run the below docker command: 16 | 17 | `To Start Application:` 18 | ```docker 19 | docker-compose up -d --build 20 | ``` 21 | and navigate to http://localhost:8501/ 22 | 23 | `To Stop Application:` 24 | ```docker 25 | docker-compose down 26 | ``` 27 | 28 | ### Trivia: 29 | 30 | _Using the volume tag in the compose file, we can mount the local folders from your computer to the Docker container. Now you can develop your app while using Docker and save changes._ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.7.3 2 | aiohttp-cors==0.7.0 3 | aioredis==1.3.1 4 | altair==4.1.0 5 | argon2-cffi==20.1.0 6 | astor==0.8.1 7 | async-generator==1.10 8 | async-timeout==3.0.1 9 | attrs==20.3.0 10 | backcall==0.2.0 11 | base58==2.1.0 12 | bleach==3.3.0 13 | blessings==1.7 14 | blinker==1.4 15 | blis==0.7.4 16 | cachetools==4.2.1 17 | catalogue==2.0.1 18 | certifi==2020.12.5 19 | cffi==1.14.4 20 | chardet==3.0.4 21 | click==7.1.2 22 | colorama==0.4.4 23 | colorful==0.5.4 24 | contextvars==2.4 25 | cymem==2.0.5 26 | dask==2021.2.0 27 | dataclasses==0.8 28 | decorator==4.4.2 29 | defusedxml==0.6.0 30 | dill==0.3.3 31 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl 32 | entrypoints==0.3 33 | fastapi==0.63.0 34 | filelock==3.0.12 35 | fsspec==0.8.5 36 | gitdb==4.0.5 37 | GitPython==3.1.13 38 | google-api-core==1.26.0 39 | google-auth==1.26.0 40 | googleapis-common-protos==1.52.0 41 | gpustat==0.6.0 42 | grpcio==1.35.0 43 | h11==0.12.0 44 | hiredis==1.1.0 45 | idna==2.10 46 | idna-ssl==1.1.0 47 | immutables==0.15 48 | importlib-metadata==3.4.0 49 | ipykernel==5.4.3 50 | ipython==7.16.1 51 | ipython-genutils==0.2.0 52 | ipywidgets==7.6.3 53 | jedi==0.18.0 54 | Jinja2==2.11.3 55 | joblib==1.0.1 56 | jsonschema==3.2.0 57 | jupyter-client==6.1.11 58 | jupyter-core==4.7.1 59 | jupyterlab-pygments==0.1.2 60 | jupyterlab-widgets==1.0.0 61 | locket==0.2.1 62 | MarkupSafe==1.1.1 63 | mistune==0.8.4 64 | modin==0.8.3 65 | msgpack==1.0.2 66 | multidict==5.1.0 67 | murmurhash==1.0.5 68 | nbclient==0.5.2 69 | nbconvert==6.0.7 70 | nbformat==5.1.2 71 | nest-asyncio==1.5.1 72 | nltk==3.5 73 | notebook==6.2.0 74 | numpy==1.19.5 75 | nvidia-ml-py3==7.352.0 76 | opencensus==0.7.12 77 | opencensus-context==0.1.2 78 | packaging==20.9 79 | pandarallel==1.5.2 80 | pandas==1.1.5 81 | pandocfilters==1.4.3 82 | parso==0.8.1 83 | partd==1.1.0 84 | pathy==0.3.6 85 | pexpect==4.8.0 86 | pickleshare==0.7.5 87 | Pillow==8.1.0 88 | preshed==3.0.5 89 | prometheus-client==0.9.0 90 | prompt-toolkit==3.0.15 91 | protobuf==3.14.0 92 | psutil==5.8.0 93 | ptyprocess==0.7.0 94 | py-spy==0.3.4 95 | pyarrow==1.0.0 96 | pyasn1==0.4.8 97 | pyasn1-modules==0.2.8 98 | pycparser==2.20 99 | pydantic==1.7.3 100 | pydeck==0.6.0 101 | Pygments==2.7.4 102 | pyparsing==2.4.7 103 | pyrsistent==0.17.3 104 | python-dateutil==2.8.1 105 | pytz==2021.1 106 | PyYAML==5.4.1 107 | pyzmq==22.0.2 108 | ray==1.1.0 109 | redis==3.5.3 110 | regex==2020.11.13 111 | requests==2.25.1 112 | 
rsa==4.7 113 | sacremoses==0.0.43 114 | scikit-learn==0.24.1 115 | scipy==1.5.4 116 | Send2Trash==1.5.0 117 | sentence-transformers==0.4.1.2 118 | sentencepiece==0.1.95 119 | six==1.15.0 120 | smart-open==3.0.0 121 | smmap==3.0.5 122 | spacy==3.0.1 123 | spacy-legacy==3.0.1 124 | srsly==2.4.0 125 | starlette==0.13.6 126 | streamlit==0.76.0 127 | swifter==1.0.7 128 | terminado==0.9.2 129 | testpath==0.4.4 130 | thinc==8.0.1 131 | threadpoolctl==2.1.0 132 | tokenizers==0.10.1 133 | toml==0.10.2 134 | toolz==0.11.1 135 | torch==1.7.1 136 | tornado==6.1 137 | tqdm==4.56.2 138 | traitlets==4.3.3 139 | transformers==4.3.2 140 | typer==0.3.2 141 | typing-extensions==3.7.4.3 142 | tzlocal==2.1 143 | urllib3==1.26.3 144 | uvicorn==0.13.3 145 | validators==0.18.2 146 | wasabi==0.8.2 147 | watchdog==2.0.0 148 | wcwidth==0.2.5 149 | webencodings==0.5.1 150 | widgetsnbextension==3.5.1 151 | xgboost==1.3.3 152 | yarl==1.6.3 153 | zipp==3.4.0 154 | -------------------------------------------------------------------------------- /service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update 6 | RUN apt-get install \ 7 | 'ffmpeg'\ 8 | 'libsm6'\ 9 | 'libxext6' -y 10 | 11 | COPY ./requirements.txt . 12 | RUN pip install -r requirements.txt 13 | 14 | COPY . . 15 | 16 | EXPOSE 8000 17 | 18 | CMD ["python", "main.py"] -------------------------------------------------------------------------------- /service/main.py: -------------------------------------------------------------------------------- 1 | # https://github.com/kumarvc/Fastapi-docker-kubernetes-streamlit/blob/master/app/main.py 2 | 3 | # https://medium.com/analytics-vidhya/deploying-a-nlp-model-with-docker-and-fastapi-d972779d8008 4 | 5 | # https://www.analyticsvidhya.com/blog/2020/12/streamlit-web-api-for-nlp-tweet-sentiment-analysis/ 6 | 7 | # https://testdriven.io/blog/fastapi-streamlit/#docker-compose 8 | 9 | from fastapi import FastAPI 10 | import dill 11 | import os 12 | import numpy as np 13 | import uvicorn 14 | from pydantic import BaseModel 15 | import pandas as pd 16 | import spacy 17 | from sentence_transformers import SentenceTransformer 18 | 19 | 20 | PKL_FILENAME = "model.pkl" 21 | MODELS_PATH = "./Models/" 22 | MODEL_FILE_PATH = os.path.join(MODELS_PATH,PKL_FILENAME) 23 | SENTENCE_BERT = SentenceTransformer('paraphrase-distilroberta-base-v1') 24 | nlp = spacy.load('en_core_web_sm') 25 | 26 | 27 | def load_model(): 28 | with open(MODEL_FILE_PATH,'rb') as file: 29 | return dill.load(file) 30 | 31 | LOADED_MODEL = load_model() 32 | 33 | app = FastAPI(title="Ham or Spam API", description="API to predict if a SMS is ham or spam") 34 | 35 | 36 | class Data(BaseModel): 37 | text:str 38 | 39 | 40 | 41 | def make_inference_df(input_text): 42 | 43 | model_input_dict = {} 44 | input_row_list = [] 45 | 46 | 47 | spacy_raw = nlp(input_text) 48 | # pos_tags = [t.pos_ for t in spacy_raw] 49 | 50 | model_input_dict['text'] = input_text 51 | model_input_dict['raw_pos'] = ' '.join([t.pos_ for t in spacy_raw]) 52 | model_input_dict['sentence-bert'] = SENTENCE_BERT.encode(input_text) 53 | 54 | input_row_list.append(model_input_dict) 55 | 56 | model_input_df = pd.DataFrame(input_row_list) 57 | return model_input_df 58 | 59 | @app.get("/") 60 | def read_root(): 61 | return {"message": "Welcome from the API"} 62 | 63 | @app.post("/predict") 64 | def predict(data:Data): 65 | 66 | model_input_df = make_inference_df(data.text) 67 | prediction = 
LOADED_MODEL.predict_proba(model_input_df) 68 | print(data.text) 69 | ham_probability = 1 - prediction[:,1][0] 70 | spam_probability = prediction[:,1][0] 71 | print(spam_probability) 72 | return { 73 | 74 | "Text" : data.text, 75 | "prediction": "SPAM" if prediction[:,1][0] > 0.5 else "HAM", 76 | "SPAM" : float(spam_probability), 77 | "HAM" : float(ham_probability), 78 | 79 | } 80 | 81 | 82 | if __name__ == '__main__': 83 | uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) -------------------------------------------------------------------------------- /service/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.7.3 2 | aiohttp-cors==0.7.0 3 | aioredis==1.3.1 4 | altair==4.1.0 5 | argon2-cffi==20.1.0 6 | astor==0.8.1 7 | async-generator==1.10 8 | async-timeout==3.0.1 9 | attrs==20.3.0 10 | backcall==0.2.0 11 | base58==2.1.0 12 | bleach==3.3.0 13 | blessings==1.7 14 | blinker==1.4 15 | blis==0.7.4 16 | cachetools==4.2.1 17 | catalogue==2.0.1 18 | certifi==2020.12.5 19 | cffi==1.14.4 20 | chardet==3.0.4 21 | click==7.1.2 22 | colorama==0.4.4 23 | colorful==0.5.4 24 | contextvars==2.4 25 | cymem==2.0.5 26 | # dask==2021.2.0 27 | # dataclasses==0.8 28 | decorator==4.4.2 29 | defusedxml==0.6.0 30 | dill==0.3.3 31 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl 32 | entrypoints==0.3 33 | fastapi==0.63.0 34 | filelock==3.0.12 35 | fsspec==0.8.5 36 | gitdb==4.0.5 37 | GitPython==3.1.13 38 | google-api-core==1.26.0 39 | google-auth==1.26.0 40 | googleapis-common-protos==1.52.0 41 | gpustat==0.6.0 42 | grpcio==1.35.0 43 | h11==0.12.0 44 | hiredis==1.1.0 45 | idna==2.10 46 | idna-ssl==1.1.0 47 | immutables==0.15 48 | importlib-metadata==3.4.0 49 | ipykernel==5.4.3 50 | ipython==7.16.1 51 | ipython-genutils==0.2.0 52 | ipywidgets==7.6.3 53 | jedi==0.18.0 54 | Jinja2==2.11.3 55 | joblib==1.0.1 56 | jsonschema==3.2.0 57 | jupyter-client==6.1.11 58 | jupyter-core==4.7.1 59 | jupyterlab-pygments==0.1.2 60 | jupyterlab-widgets==1.0.0 61 | locket==0.2.1 62 | MarkupSafe==1.1.1 63 | mistune==0.8.4 64 | modin==0.8.3 65 | msgpack==1.0.2 66 | multidict==5.1.0 67 | murmurhash==1.0.5 68 | nbclient==0.5.2 69 | nbconvert==6.0.7 70 | nbformat==5.1.2 71 | nest-asyncio==1.5.1 72 | nltk==3.5 73 | notebook==6.2.0 74 | numpy==1.19.5 75 | nvidia-ml-py3==7.352.0 76 | opencensus==0.7.12 77 | opencensus-context==0.1.2 78 | packaging==20.9 79 | pandarallel==1.5.2 80 | pandas==1.1.5 81 | pandocfilters==1.4.3 82 | parso==0.8.1 83 | partd==1.1.0 84 | pathy==0.3.6 85 | pexpect==4.8.0 86 | pickleshare==0.7.5 87 | Pillow==8.1.0 88 | preshed==3.0.5 89 | prometheus-client==0.9.0 90 | prompt-toolkit==3.0.15 91 | protobuf==3.14.0 92 | psutil==5.8.0 93 | ptyprocess==0.7.0 94 | py-spy==0.3.4 95 | pyarrow==1.0.0 96 | pyasn1==0.4.8 97 | pyasn1-modules==0.2.8 98 | pycparser==2.20 99 | pydantic==1.7.3 100 | pydeck==0.6.0 101 | Pygments==2.7.4 102 | pyparsing==2.4.7 103 | pyrsistent==0.17.3 104 | python-dateutil==2.8.1 105 | pytz==2021.1 106 | PyYAML==5.4.1 107 | pyzmq==22.0.2 108 | ray==1.1.0 109 | redis==3.5.3 110 | regex==2020.11.13 111 | requests==2.25.1 112 | rsa==4.7 113 | sacremoses==0.0.43 114 | scikit-learn==0.24.1 115 | scipy==1.5.4 116 | Send2Trash==1.5.0 117 | sentence-transformers==0.4.1.2 118 | sentencepiece==0.1.95 119 | six==1.15.0 120 | smart-open==3.0.0 121 | smmap==3.0.5 122 | spacy==3.0.1 123 | spacy-legacy==3.0.1 124 | srsly==2.4.0 125 | starlette==0.13.6 126 
| streamlit==0.76.0 127 | # swifter==1.0.7 128 | terminado==0.9.2 129 | testpath==0.4.4 130 | thinc==8.0.1 131 | threadpoolctl==2.1.0 132 | tokenizers==0.10.1 133 | toml==0.10.2 134 | toolz==0.11.1 135 | torch==1.7.1 136 | tornado==6.1 137 | tqdm==4.56.2 138 | traitlets==4.3.3 139 | transformers==4.3.2 140 | typer==0.3.2 141 | typing-extensions==3.7.4.3 142 | tzlocal==2.1 143 | urllib3==1.26.3 144 | uvicorn==0.13.3 145 | validators==0.18.2 146 | wasabi==0.8.2 147 | watchdog==2.0.0 148 | wcwidth==0.2.5 149 | webencodings==0.5.1 150 | widgetsnbextension==3.5.1 151 | xgboost==1.3.3 152 | yarl==1.6.3 153 | zipp==3.4.0 154 | --------------------------------------------------------------------------------
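Quick check of the deployed API: the sketch below is a minimal example of calling the prediction endpoint directly with Python `requests`, assuming the stack has been started with `docker-compose up -d --build` and the `service` container is published on `localhost:8000` as in `docker-compose.yml`. The payload shape and response keys follow `service/main.py`.

```python
# Minimal sketch: query the spam/ham service directly.
# Assumes the compose stack is running and the FastAPI app from
# service/main.py is reachable on localhost:8000.
import requests

payload = {"text": "Free entry in 2 a wkly comp to win FA Cup final tkts. Text FA to 87121 to receive entry"}

response = requests.post("http://localhost:8000/predict", json=payload)
response.raise_for_status()

result = response.json()
# service/main.py returns "Text", "prediction" ("SPAM" or "HAM"),
# and the two class probabilities under "SPAM" and "HAM".
print(result["prediction"], result["SPAM"], result["HAM"])
```

The Streamlit front end in `UI/app.py` makes the same request, but addresses the back end by its compose service name (`http://service:8000/predict/`) rather than `localhost`, since both containers share the docker-compose network.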