├── .gitattributes ├── .gitignore ├── 2_level_doc_classification ├── bento_package.py ├── bento_predictor.py ├── docker-compose.yml ├── ecs-params.yml ├── hybrid_v2.py ├── pre_process_text.py ├── pre_processing.py └── train.py ├── LICENSE ├── README.md ├── conditional_GAN ├── __pycache__ │ └── bento_predictor.cpython-38.pyc ├── artifacts │ ├── saved_model.pb │ └── variables │ │ ├── variables.data-00000-of-00001 │ │ └── variables.index ├── bento_package.py ├── bento_predictor.py ├── infer.py └── train.py ├── document_classification ├── __pycache__ │ ├── hybrid_v1.cpython-38.pyc │ ├── pre_process_text.cpython-38.pyc │ └── pre_processing.cpython-38.pyc ├── bento_package.py ├── bento_predictor.py ├── docker-compose.yml ├── ecs-params.yml ├── hybrid_v1.py ├── pre_process_text.py ├── pre_processing.py └── train.py ├── image_classifier ├── __pycache__ │ └── bento_predictor.cpython-38.pyc ├── artifacts │ ├── saved_model.pb │ └── variables │ │ ├── variables.data-00000-of-00001 │ │ └── variables.index ├── bento_package.py ├── bento_predictor.py ├── docker-compose.yml ├── ecs-params.yml ├── infer.py └── train.py ├── multiple_models ├── bento_package.py ├── bento_predictor.py ├── docker-compose.yml ├── ecs-params.yml ├── hybrid_v1.py ├── hybrid_v2.py ├── pre_process_text.py ├── pre_processing.py └── train.py ├── trial.py └── vanilla_GAN ├── __pycache__ └── bento_predictor.cpython-38.pyc ├── artifacts ├── saved_model.pb └── variables │ ├── variables.data-00000-of-00001 │ └── variables.index ├── bento_package.py ├── bento_predictor.py ├── infer.py └── train.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.pyc -------------------------------------------------------------------------------- /2_level_doc_classification/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import ModelZoo 2 | import tensorflow as tf 3 | from tensorflow.keras.models import load_model 4 | from tensorflow.keras.preprocessing.text import Tokenizer 5 | import spacy 6 | import json 7 | 8 | 9 | def som_models( 10 | model_zoo, 11 | tf_model_path_som: str, 12 | text_file_path_som: str, 13 | master_labels_path: str, 14 | sub_labels_path: str, 15 | ): 16 | model_cnn = load_model(tf_model_path_som) 17 | tf.saved_model.save(model_cnn, "artifacts/") 18 | model_cnn = tf.saved_model.load("artifacts/") 19 | model_zoo.pack("som_model", model_cnn) 20 | 21 | text_model = spacy.load("en_core_web_sm") 22 | model_zoo.pack("som_spacy_model", text_model) 23 | 24 | tokenizer = Tokenizer() 25 | with open(text_file_path_som, "r") as f: 26 | bow = f.read() 27 | tokenizer.fit_on_texts(bow.split("####")) 28 | model_zoo.pack("som_tokenizer", tokenizer) 29 | 30 | with open(master_labels_path, "r") as f: 31 | labels_som = json.load(f) 32 | model_zoo.pack("som_master_labels", labels_som) 33 | 34 | with open(sub_labels_path, "r") as g: 35 | sub_labels_som = json.load(g) 36 | model_zoo.pack("som_sub_labels", sub_labels_som) 37 | 38 | 39 | def main(): 40 | model_zoo = ModelZoo() 41 | som_models( 42 | model_zoo=model_zoo, 43 | tf_model_path_som=tf_model_path_som, 44 | text_file_path_som=text_file_path_som, 45 | master_labels_path=master_labels_path, 46 |
sub_labels_path=sub_labels_path, 47 | ) 48 | saved_path = model_zoo.save() 49 | 50 | 51 | tf_model_path_som = ( 52 | "/Users/vsatpathy/Desktop/docs/training_data/som/document_classifier.h5" 53 | ) 54 | text_file_path_som = ( 55 | "/Users/vsatpathy/Desktop/docs/training_data/som/file_and_text_som.txt" 56 | ) 57 | master_labels_path = ( 58 | "/Users/vsatpathy/Desktop/docs/training_data/som/rev_labels_master_som.json" 59 | ) 60 | sub_labels_path = "/Users/vsatpathy/Desktop/docs/training_data/som/rev_labels_som.json" 61 | main() 62 | -------------------------------------------------------------------------------- /2_level_doc_classification/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.types import FileLike 3 | from bentoml.adapters import JsonInput, FileInput, MultiFileInput 4 | from bentoml.frameworks.spacy import SpacyModelArtifact 5 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 6 | from bentoml.service.artifacts.common import ( 7 | JSONArtifact, 8 | PickleArtifact, 9 | ) 10 | 11 | import tensorflow as tf 12 | from tensorflow.keras.preprocessing.image import load_img 13 | import numpy as np 14 | from pytesseract import image_to_string 15 | import re 16 | from PIL import Image 17 | from typing import List 18 | 19 | 20 | @bentoml.env(infer_pip_packages=True) 21 | @bentoml.artifacts( 22 | [ 23 | TensorflowSavedModelArtifact("som_model"), 24 | SpacyModelArtifact("som_spacy_model"), 25 | PickleArtifact("som_tokenizer"), 26 | JSONArtifact("som_master_labels"), 27 | JSONArtifact("som_sub_labels"), 28 | ] 29 | ) 30 | class ModelZoo(bentoml.BentoService): 31 | def helper(self, text): 32 | dummy = [] 33 | for word in text: 34 | dummy.append(str(word)) 35 | final = " ".join(dummy) 36 | return final 37 | 38 | def preprocess_spacy(self, spacy_model, text, num_of_words: int): 39 | text = str(text) 40 | text = text.split(" ") 41 | text = self.helper(text) 42 | text = str(text.lower()) 43 | # Remove all the special characters 44 | text = re.sub(r"\W", " ", text) 45 | text = re.sub(r"[^a-zA-Z ]+", "", text) 46 | # remove all single characters 47 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 48 | # Remove single characters from the start 49 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 50 | # Substituting multiple spaces with single space 51 | text = re.sub(r"\s+", " ", text, flags=re.I) 52 | # text = self.artifacts.mcr_spacy_model(text) 53 | text = spacy_model(text) 54 | filtered = [token.lemma_ for token in text if token.is_stop == False] 55 | text = " ".join(filtered[: num_of_words * 2]) 56 | text = text.strip().split(" ") 57 | text = " ".join(text[:num_of_words]) 58 | return text 59 | 60 | def tokenize_sentence(self, sentence, tokenizer, maximum_word_length): 61 | updated_sentence = sentence.split(" ") 62 | tok_sent = [] 63 | for word in updated_sentence: 64 | if word in tokenizer.word_index: 65 | tok_sent.append(tokenizer.word_index[word]) 66 | else: 67 | tok_sent.append(0) 68 | if len(tok_sent) != maximum_word_length: 69 | delta = maximum_word_length - len(tok_sent) 70 | for i in range(delta): 71 | tok_sent.append(0) 72 | return tok_sent 73 | 74 | def pre_process_image(self, image_file): 75 | ocr_image = np.asarray(Image.open(image_file)) 76 | image = np.asarray( 77 | Image.open(image_file).convert(mode="RGB").resize((100, 100)) 78 | ) 79 | image = np.divide(image, 255.0) 80 | image = np.asarray([image]).astype("float32") 81 | return ocr_image, image 82 | 83 | def pre_process_som(self, 
file): 84 | ocr_image, image = self.pre_process_image(image_file=file) 85 | doc_text = image_to_string(ocr_image) 86 | doc_text_processed = self.preprocess_spacy( 87 | spacy_model=self.artifacts.som_spacy_model, text=doc_text, num_of_words=10 88 | ) 89 | fin_text = self.tokenize_sentence( 90 | sentence=doc_text_processed, 91 | tokenizer=self.artifacts.som_tokenizer, 92 | maximum_word_length=10, 93 | ) 94 | return image, np.asarray([fin_text]).astype("float32") 95 | 96 | @bentoml.api(input=FileInput()) 97 | def predict_document_labels_som(self, file_stream): 98 | image, text = self.pre_process_som(file=file_stream) 99 | model = self.artifacts.som_model.signatures["serving_default"] 100 | model._num_positional_args = 2 101 | results = model(tf.constant(text), tf.constant(image)) 102 | mas_results = results.get("dense_1")[0].numpy() 103 | sub_results = results.get("dense_4")[0].numpy() 104 | master_label = self.artifacts.som_master_labels[str(np.argmax(mas_results))] 105 | sub_label = self.artifacts.som_sub_labels[str(np.argmax(sub_results))] 106 | return {"master document type": master_label, "sub document type": sub_label} 107 | -------------------------------------------------------------------------------- /2_level_doc_classification/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | web: 4 | image: 142339138776.dkr.ecr.us-east-2.amazonaws.com/docuedge-model-zoo:latest 5 | ports: 6 | - "5000:5000" 7 | logging: 8 | driver: awslogs 9 | options: 10 | awslogs-group: docuedge-modelserver-ecs 11 | awslogs-region: us-east-2 12 | awslogs-stream-prefix: web 13 | volumes: 14 | - /app/temp 15 | -------------------------------------------------------------------------------- /2_level_doc_classification/ecs-params.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | task_definition: 3 | task_execution_role: ecsTaskExecutionRoleBento 4 | ecs_network_mode: awsvpc 5 | task_size: 6 | mem_limit: 8GB 7 | cpu_limit: 4096 8 | efs_volumes: 9 | - name: docuedgedev-efs 10 | filesystem_id: fs-4717c93f 11 | root_directory: /smartbox-config 12 | run_params: 13 | network_configuration: 14 | awsvpc_configuration: 15 | subnets: 16 | - subnet-00e7bff093931a167 17 | - subnet-0345b051535c9625d 18 | security_groups: 19 | - sg-09b7e06cb8b13167d 20 | - sg-0601a52d4ea28af05 21 | assign_public_ip: ENABLED 22 | -------------------------------------------------------------------------------- /2_level_doc_classification/hybrid_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | from tensorflow.keras.preprocessing.image import load_img 5 | from tensorflow.keras.preprocessing.text import Tokenizer 6 | from tensorflow.keras.layers import ( 7 | Input, 8 | Conv2D, 9 | Dense, 10 | Flatten, 11 | Embedding, 12 | Concatenate, 13 | GlobalMaxPool1D, 14 | Conv1D, 15 | MaxPooling1D, 16 | ) 17 | from tensorflow.keras.models import Model, load_model 18 | import os 19 | import json 20 | import mlflow 21 | import mlflow.tensorflow 22 | 23 | tracking_uri = ( 24 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 25 | ) 26 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 27 | 28 | 29 | def read_data(path): 30 | bow = open(path, "r") 31 | data = bow.readlines() 32 | all_data_paths = [] 33 | all_texts = [] 34 | doc_type_y_labels = {} 35 | master_doc_type_y_labels = 
{} 36 | for line in data: 37 | line_data = line.split("####") 38 | all_data_paths.append(line_data[0]) 39 | all_texts.append(line_data[-1][:-1]) 40 | doc_type_label = line_data[0].split("/")[-2] 41 | master_doc_type_label = line_data[0].split("/")[-3] 42 | if doc_type_label not in doc_type_y_labels: 43 | doc_type_y_labels[doc_type_label] = len(doc_type_y_labels) 44 | if master_doc_type_label not in master_doc_type_y_labels: 45 | master_doc_type_y_labels[master_doc_type_label] = len( 46 | master_doc_type_y_labels 47 | ) 48 | 49 | rev_labels_doc_type = {} 50 | for key, val in doc_type_y_labels.items(): 51 | rev_labels_doc_type[val] = key 52 | rev_labels_master_doc_type = {} 53 | for key, val in master_doc_type_y_labels.items(): 54 | rev_labels_master_doc_type[val] = key 55 | 56 | return ( 57 | all_data_paths, 58 | doc_type_y_labels, 59 | rev_labels_doc_type, 60 | all_texts, 61 | master_doc_type_y_labels, 62 | rev_labels_master_doc_type, 63 | ) 64 | 65 | 66 | def tokenize_sentence(sentence, tokenizer, maximum_word_length): 67 | updated_sentence = sentence.split(" ") 68 | tok_sent = [] 69 | for word in updated_sentence: 70 | if word in tokenizer.word_index: 71 | tok_sent.append(tokenizer.word_index[word]) 72 | else: 73 | tok_sent.append(0) 74 | if len(tok_sent) != maximum_word_length: 75 | delta = maximum_word_length - len(tok_sent) 76 | for i in range(delta): 77 | tok_sent.append(0) 78 | return tok_sent 79 | 80 | 81 | def data_loader_text( 82 | bs, 83 | data, 84 | y_lab, 85 | tokenizer, 86 | text_data, 87 | image_input_shape, 88 | max_word_length, 89 | y_sub_labels, 90 | ): 91 | while True: 92 | images = [] 93 | master_labels = [] 94 | sub_labels = [] 95 | texts = [] 96 | while len(images) < bs: 97 | indice = random.randint(0, len(data) - 1) 98 | target = data[indice].split("/")[-3] 99 | sub_target = data[indice].split("/")[-2] 100 | master_labels.append(y_lab[target]) 101 | sub_labels.append(y_sub_labels[sub_target]) 102 | 103 | test_img = np.asarray(load_img(data[indice], target_size=image_input_shape)) 104 | img = np.divide(test_img, 255.0) 105 | images.append(img) 106 | 107 | tok_sen = tokenize_sentence( 108 | text_data[indice], tokenizer, maximum_word_length=max_word_length 109 | ) 110 | texts.append(tok_sen) 111 | yield [np.asarray(images), np.asarray(texts)], [ 112 | np.asarray(master_labels), 113 | np.asarray(sub_labels), 114 | ] 115 | 116 | 117 | def model_arc(y_labels, tokenizer, text_model_inp_shape, image_inp_shape, y_sub_labels): 118 | inp_layer_texts = Input(shape=text_model_inp_shape) 119 | inp_layer_images = Input(shape=image_inp_shape) 120 | 121 | embedding_layer = Embedding( 122 | input_dim=len(tokenizer.word_index) + 1, 123 | output_dim=64, 124 | input_length=text_model_inp_shape, 125 | trainable=True, 126 | )(inp_layer_texts) 127 | pooling_layer = GlobalMaxPool1D()(embedding_layer) 128 | dense_layer = Dense(units=64, activation="relu")(pooling_layer) 129 | 130 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 131 | inp_layer_images 132 | ) 133 | flatten_layer = Flatten()(conv_layer) 134 | 135 | concat_layer = Concatenate()([flatten_layer, dense_layer]) 136 | out_layer = Dense(len(y_labels), activation="softmax")(concat_layer) 137 | 138 | sub_model_inp = Dense(units=64, activation="relu")(out_layer) 139 | sub_dense_layer = Dense(units=256, activation="relu")(sub_model_inp) 140 | 141 | sub_concat_layer = Concatenate()([sub_dense_layer, concat_layer]) 142 | sub_out_layer = Dense(units=len(y_sub_labels), activation="softmax")( 143 | sub_concat_layer 144 
| ) 145 | 146 | model = Model([inp_layer_images, inp_layer_texts], [out_layer, sub_out_layer]) 147 | model.compile( 148 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 149 | ) 150 | return model 151 | 152 | 153 | def train_hybrid_v2( 154 | text_plus_file_path: str, 155 | batch_size: int, 156 | epochs: int, 157 | image_shape: int, 158 | max_words: int, 159 | artifact_name: str, 160 | save_dir_path: str, 161 | trained_model_path: str, 162 | experiment_name: str, 163 | ): 164 | mlflow.set_tracking_uri(tracking_uri) 165 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 166 | try: 167 | expr_name = experiment_name # create a new experiment (do not replace) 168 | mlflow.create_experiment(expr_name, s3_bucket) 169 | mlflow.set_experiment(expr_name) 170 | experiment = mlflow.get_experiment_by_name(experiment_name) 171 | except: 172 | experiment = mlflow.get_experiment_by_name(experiment_name) 173 | 174 | ( 175 | all_imgs_path, 176 | doc_type_y_labels, 177 | rev_labels_doc_type, 178 | all_text, 179 | master_doc_type_label, 180 | rev_labels_master_doc_type, 181 | ) = read_data(path=text_plus_file_path) 182 | num_train_img = len(all_imgs_path) 183 | 184 | with open( 185 | os.path.join(save_dir_path, artifact_name, f"rev_labels_{artifact_name}.json"), 186 | "w+", 187 | ) as tar: 188 | json.dump(rev_labels_doc_type, tar) 189 | with open( 190 | os.path.join( 191 | save_dir_path, artifact_name, f"rev_labels_master_{artifact_name}.json" 192 | ), 193 | "w+", 194 | ) as tar: 195 | json.dump(rev_labels_master_doc_type, tar) 196 | 197 | print("target_encodings: ", master_doc_type_label) 198 | print("target_encodings: ", doc_type_y_labels) 199 | print("Number of training images: ", num_train_img) 200 | 201 | bow = open(text_plus_file_path, "r") 202 | tokenizer = Tokenizer() 203 | tokenizer.fit_on_texts(bow.read().split("####")) 204 | 205 | train_gen = data_loader_text( 206 | tokenizer=tokenizer, 207 | y_lab=master_doc_type_label, 208 | data=all_imgs_path, 209 | text_data=all_text, 210 | bs=batch_size, 211 | image_input_shape=(image_shape, image_shape, 3), 212 | max_word_length=max_words, 213 | y_sub_labels=doc_type_y_labels, 214 | ) 215 | if os.path.isfile(trained_model_path): 216 | model = load_model(trained_model_path) 217 | else: 218 | model = model_arc( 219 | y_labels=master_doc_type_label, 220 | tokenizer=tokenizer, 221 | text_model_inp_shape=(max_words,), 222 | image_inp_shape=(image_shape, image_shape, 3), 223 | y_sub_labels=doc_type_y_labels, 224 | ) 225 | mlflow.tensorflow.autolog(every_n_iter=1) 226 | with mlflow.start_run(experiment_id=experiment.experiment_id): 227 | mlflow.log_metrics( 228 | { 229 | "batch_size": batch_size, 230 | "epochs": epochs, 231 | "image_shape": image_shape, 232 | "max_words": max_words, 233 | } 234 | ) 235 | model.fit( 236 | x=train_gen, steps_per_epoch=num_train_img // batch_size, epochs=epochs 237 | ) 238 | model.save( 239 | filepath=os.path.join( 240 | save_dir_path, artifact_name, "document_classifier.h5" 241 | ) 242 | ) 243 | meta_data_path = os.path.join(save_dir_path, artifact_name) 244 | for artifact in sorted(os.listdir(meta_data_path)): 245 | if artifact != ".DS_Store": 246 | artifact_path = os.path.join(meta_data_path, artifact) 247 | if ( 248 | os.path.isfile(artifact_path) 249 | and artifact_path.split(".")[-1] != "h5" 250 | ): 251 | print(f"artifact to be uploaded is: {artifact}") 252 | mlflow.log_artifact(local_path=artifact_path) 253 | 254 | artifact_uri = mlflow.get_artifact_uri() 255 | print(artifact_uri) 
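        # Note (illustrative): with the S3 artifact store configured above, this
        # URI typically has the shape <s3_bucket>/<experiment_id>/<run_id>/artifacts.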
256 | mlflow.end_run() 257 | -------------------------------------------------------------------------------- /2_level_doc_classification/pre_process_text.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import re 3 | from pdf2image import convert_from_path 4 | import os 5 | from tqdm import tqdm 6 | import pre_processing 7 | from PIL import Image 8 | import pytesseract 9 | 10 | nlp = spacy.load("en_core_web_sm") 11 | 12 | 13 | def helper(text): 14 | dummy = [] 15 | for word in text: 16 | dummy.append(str(word)) 17 | final = " ".join(dummy) 18 | return final 19 | 20 | 21 | def preprocess_spacy(text, num_of_words: int): 22 | text = str(text) 23 | text = text.split(" ") 24 | text = helper(text) 25 | text = str(text.lower()) 26 | # Remove all the special characters 27 | text = re.sub(r"\W", " ", text) 28 | text = re.sub(r"[^a-zA-Z ]+", "", text) 29 | # remove all single characters 30 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 31 | # Remove single characters from the start 32 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 33 | # Substituting multiple spaces with single space 34 | text = re.sub(r"\s+", " ", text, flags=re.I) 35 | text = nlp(text) 36 | filtered = [token.lemma_ for token in text if token.is_stop == False] 37 | text = " ".join(filtered[: num_of_words * 2]) 38 | text = text.strip().split(" ") 39 | text = " ".join(text[:num_of_words]) 40 | return text 41 | 42 | 43 | def read_text_from_pages_v2( 44 | complete_folder_path: str, 45 | path_to_save_essential_data: str, 46 | meta_name: str, 47 | num_of_words: int, 48 | ): 49 | final_path_for_data = os.path.join( 50 | path_to_save_essential_data, meta_name, f"file_and_text_{meta_name}.txt" 51 | ) 52 | if os.path.isfile(final_path_for_data): 53 | data = open(final_path_for_data, "r").read() 54 | else: 55 | data = "null" 56 | print("#### Reading pages ####") 57 | document_folders_path = os.path.join(complete_folder_path) 58 | master_doc_types = sorted(os.listdir(document_folders_path)) 59 | text_of_all_pages = [] 60 | for master_doc_type in master_doc_types: 61 | if master_doc_type != ".DS_Store": 62 | print("MASTER DOCUMENT TYPE: ", master_doc_type) 63 | sub_doc_type_path = os.path.join(document_folders_path, master_doc_type) 64 | for doc_image_type in sorted(os.listdir(sub_doc_type_path)): 65 | if doc_image_type != ".DS_Store": 66 | print("DOCUMENT TYPE: ", doc_image_type) 67 | complete_doc_image_path = os.path.join( 68 | sub_doc_type_path, doc_image_type 69 | ) 70 | pages = sorted(os.listdir(complete_doc_image_path)) 71 | for page in tqdm(pages): 72 | if page != ".DS_Store": 73 | page_path = os.path.join(complete_doc_image_path, page) 74 | if page_path not in data: 75 | document_page = Image.open(page_path) 76 | document_text = pytesseract.image_to_string( 77 | document_page 78 | ) 79 | document_page.close() 80 | essential_file_path_and_text = ( 81 | page_path 82 | + "####" 83 | + preprocess_spacy( 84 | document_text, num_of_words=num_of_words 85 | ) 86 | + "\n" 87 | ) 88 | text_of_all_pages.append(essential_file_path_and_text) 89 | 90 | if os.path.isfile(final_path_for_data): 91 | all_essential_data = open(final_path_for_data, "a+") 92 | all_essential_data.writelines(text_of_all_pages) 93 | else: 94 | all_essential_data = open(final_path_for_data, "w") 95 | all_essential_data.writelines(text_of_all_pages) 96 | return final_path_for_data 97 | 98 | 99 | def pdf_to_images(full_path_pdf: str, converted_images_path: str, meta_name: str): 100 | doc = full_path_pdf.split("/")[-1] 101 | 
index = 0 102 | OUTPUT_PATH = converted_images_path 103 | os.makedirs(name=OUTPUT_PATH, exist_ok=True) 104 | 105 | print("Document name: ", doc) 106 | if str(doc.split(".pdf")[-2]) + "_" + str(index) + ".jpg" not in os.listdir( 107 | converted_images_path 108 | ): 109 | pil_images = convert_from_path(full_path_pdf, dpi=300) 110 | 111 | for image in tqdm(pil_images): 112 | processed_image = pre_processing.preprocess_image_file(image) 113 | try: 114 | processed_image = Image.fromarray(processed_image) 115 | processed_image.save( 116 | os.path.join(OUTPUT_PATH, str(doc.split(".pdf")[-2])) 117 | + "_" 118 | + str(index) 119 | + ".jpg", 120 | format="JPEG", 121 | subsampling=0, 122 | quality=100, 123 | ) 124 | index += 1 125 | processed_image.close() 126 | except: 127 | index += 1 128 | else: 129 | pass -------------------------------------------------------------------------------- /2_level_doc_classification/pre_processing.py: -------------------------------------------------------------------------------- 1 | """IMAGE PREPROCESSING FUNCTIONS 2 | """ 3 | import cv2 4 | import numpy as np 5 | from scipy.ndimage.filters import rank_filter 6 | 7 | # from sbox.utils.sbox_logger import logger 8 | import pytesseract 9 | import re 10 | import imutils 11 | from PIL import Image 12 | 13 | # print("error") # = logger(__name__) 14 | 15 | 16 | class PagePreprocess(object): 17 | def __init__(self, im): 18 | self.err = False 19 | self.orig_im = im 20 | self.orig_shape = self.orig_im.shape 21 | self.image = im 22 | 23 | def crop(self): 24 | try: 25 | self.image, self.num_tries = process_image(self.orig_im) 26 | self.crop_shape = self.image.shape 27 | return self.image 28 | except Exception as e: 29 | print("crop_obj_Error") # (f"Error: {e}", exc_info=True) 30 | 31 | def deskew(self): 32 | try: 33 | self.image, self.theta_est = process_skewed_crop(self.image) 34 | return self.image 35 | except Exception as e: 36 | print("deskew_obj_Error") # (f"Error: {e}", exc_info=True) 37 | 38 | 39 | def auto_canny(image, sigma=0.33): 40 | try: 41 | v = np.median(image) 42 | lower = int(max(0, (1.0 - sigma) * v)) 43 | upper = int(min(255, (1.0 + sigma) * v)) 44 | edged = cv2.Canny(image, lower, upper, True) 45 | return edged 46 | except Exception as e: 47 | print("auto_canny_Error") # (f"Error: {e}", exc_info=True) 48 | 49 | 50 | def dilate(image, kernel, iterations): 51 | dilated_image = cv2.dilate(image, kernel, iterations=iterations) 52 | return dilated_image 53 | 54 | 55 | def downscale_image(im, max_dim=2048): 56 | try: 57 | a, b = im.shape[:2] 58 | if max(a, b) <= max_dim: 59 | return 1.0, im 60 | 61 | scale = 1.0 * max_dim / max(a, b) 62 | new_im = cv2.resize(im, (int(b * scale), int(a * scale)), cv2.INTER_AREA) 63 | return scale, new_im 64 | except Exception as e: 65 | print("error") # (f"Error: {e}", exc_info=True) 66 | 67 | 68 | def find_components(im, max_components=16): 69 | try: 70 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) 71 | dilation = dilate(im, kernel, 6) 72 | 73 | count = 21 74 | n = 0 75 | sigma = 0.000 76 | 77 | while count > max_components: 78 | n += 1 79 | sigma += 0.005 80 | result = cv2.findContours(dilation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 81 | if len(result) == 3: 82 | _, contours, hierarchy = result 83 | elif len(result) == 2: 84 | contours, hierarchy = result 85 | possible = find_likely_rectangles(contours, sigma) 86 | count = len(possible) 87 | 88 | return (dilation, possible, n) 89 | except Exception as e: 90 | print("comp_error") # (f"Error: {e}", exc_info=True) 
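    # Note: the while-loop above keeps raising the polygon-approximation
    # tolerance (sigma) and re-running contour detection until at most
    # max_components candidate text-block rectangles remain.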
91 | 92 | 93 | def find_likely_rectangles(contours, sigma): 94 | try: 95 | contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] 96 | possible = [] 97 | for c in contours: 98 | 99 | peri = cv2.arcLength(c, True) 100 | approx = cv2.approxPolyDP(c, sigma * peri, True) 101 | box = make_box(approx) 102 | possible.append(box) 103 | 104 | return possible 105 | except Exception as e: 106 | print("likely_rec_error") # (f"Error: {e}", exc_info=True) 107 | 108 | 109 | def make_box(poly): 110 | try: 111 | x = [] 112 | y = [] 113 | for p in poly: 114 | for point in p: 115 | x.append(point[0]) 116 | y.append(point[1]) 117 | xmax = max(x) 118 | ymax = max(y) 119 | xmin = min(x) 120 | ymin = min(y) 121 | return (xmin, ymin, xmax, ymax) 122 | except Exception as e: 123 | print("bbox_error") # (f"Error: {e}", exc_info=True) 124 | 125 | 126 | def rect_union(crop1, crop2): 127 | x11, y11, x21, y21 = crop1 128 | x12, y12, x22, y22 = crop2 129 | return min(x11, x12), min(y11, y12), max(x21, x22), max(y21, y22) 130 | 131 | 132 | def rect_area(crop): 133 | x1, y1, x2, y2 = crop 134 | return max(0, x2 - x1) * max(0, y2 - y1) 135 | 136 | 137 | def crop_image(im, rect, scale): 138 | try: 139 | xmin, ymin, xmax, ymax = rect 140 | crop = [xmin, ymin, xmax, ymax] 141 | xmin, ymin, xmax, ymax = [int(x / scale) for x in crop] 142 | if ((ymax - ymin) * (xmax - xmin)) > 0.25 * im.size: 143 | cropped = im[ymin:ymax, xmin:xmax] 144 | else: 145 | cropped = im 146 | return cropped 147 | except Exception as e: 148 | print("crop_error_1") # (f"Error: {e}", exc_info=True) 149 | 150 | 151 | def reduce_noise_raw(im): 152 | bilat = cv2.bilateralFilter(im, 4, 75, 75) 153 | blur = cv2.medianBlur(bilat, 1) 154 | return blur 155 | 156 | 157 | def reduce_noise_edges(im): 158 | try: 159 | structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) 160 | opening = cv2.morphologyEx(im, cv2.MORPH_OPEN, structuring_element) 161 | maxed_rows = rank_filter(opening, -4, size=(1, 20)) 162 | maxed_cols = rank_filter(opening, -4, size=(20, 1)) 163 | debordered = np.minimum(np.minimum(opening, maxed_rows), maxed_cols) 164 | return debordered 165 | except Exception as e: 166 | print("noise_red_Error") # (f"Error: {e}", exc_info=True) 167 | 168 | 169 | def rects_are_vertical(rect1, rect2, rect_align=2): 170 | try: 171 | xmin1, ymin1, xmax1, ymax1 = rect1 172 | xmin2, ymin2, xmax2, ymax2 = rect2 173 | 174 | midpoint1 = (xmin1 + xmax1) / 2 175 | midpoint2 = (xmin2 + xmax2) / 2 176 | dist = abs(midpoint1 - midpoint2) 177 | 178 | rectarea1 = rect_area(rect1) 179 | rectarea2 = rect_area(rect2) 180 | if rectarea1 > rectarea2: 181 | thres = (xmax1 - xmin1) * rect_align 182 | else: 183 | thres = (xmax2 - xmin2) * rect_align 184 | 185 | if thres > dist: 186 | align = True 187 | else: 188 | align = False 189 | return align 190 | except Exception as e: 191 | print("vert_rec_Error") # (f"Error: {e}", exc_info=True) 192 | 193 | 194 | def find_final_crop(im, rects, orig_im): 195 | try: 196 | current = None 197 | for rect in rects: 198 | if current is None: 199 | current = rect 200 | continue 201 | 202 | aligned = rects_are_vertical(current, rect) 203 | 204 | if not aligned: 205 | continue 206 | 207 | current = rect_union(current, rect) 208 | if current is not None: 209 | return current 210 | else: 211 | return (0, 0, orig_im.shape[0], orig_im.shape[1]) 212 | except Exception as e: 213 | print("crop_Error") # (f"Error: {e}", exc_info=True) 214 | 215 | 216 | def process_image(orig_im): 217 | try: 218 | scale, im = downscale_image(orig_im) 
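        # The steps below all run on the downscaled copy: denoise, auto-Canny
        # edges, border suppression, then component search; crop_image maps the
        # chosen rectangle back onto orig_im using `scale`.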
219 |
220 |         blur = reduce_noise_raw(im.copy())
221 |
222 |         edges = auto_canny(blur.copy())
223 |
224 |         debordered = reduce_noise_edges(edges.copy())
225 |
226 |         dilation, rects, num_tries = find_components(debordered, 16)
227 |
228 |         final_rect = find_final_crop(dilation, rects, orig_im)
229 |
230 |         cropped = crop_image(orig_im, final_rect, scale)
231 |         # kernel = np.ones((3, 3), np.float32) / 25
232 |         # smooth2d = cv2.filter2D(cropped, -1, kernel=kernel)
233 |         return (cropped, num_tries)
234 |     except Exception as e:
235 |         print("process") # (f"Error: {e}", exc_info=True)
236 |
237 |
238 | def rad_to_deg(theta):
239 |     return theta * 180 / np.pi
240 |
241 |
242 | def rotate(image, theta):
243 |     try:
244 |         (h, w) = image.shape[:2]
245 |         center = (w / 2, h / 2)
246 |         M = cv2.getRotationMatrix2D(center, theta, 1)
247 |         rotated = cv2.warpAffine(
248 |             image,
249 |             M,
250 |             (int(w), int(h)),
251 |             cv2.INTER_LINEAR,
252 |             borderMode=cv2.BORDER_CONSTANT,
253 |             borderValue=(255, 255, 255),
254 |         )
255 |         return rotated
256 |     except Exception as e:
257 |         print("rotation_error") # (f"Error: {e}", exc_info=True)
258 |
259 |
260 | def angle_calculation(gray):
261 |     gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY)
262 |     gray = cv2.bitwise_not(gray)
263 |     thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
264 |
265 |     coords = np.column_stack(np.where(thresh > 0))
266 |     # print(coords, coords.shape)
267 |
268 |     min_y = coords[0][0]
269 |     max_y = coords[-1][0]
270 |     min_x = coords[0][1] # rows of coords are (y, x), so x sits in column 1
271 |     max_x = coords[-1][1]
272 |
273 |     left_most = coords[0]
274 |     right_most = coords[0]
275 |     top_most = coords[0]
276 |     bottom_most = coords[0]
277 |     # print(coords[0], coords[-1])
278 |     for i in range(1, coords.shape[0]):
279 |         y, x = coords[i][0], coords[i][1]
280 |         if y <= min_y:
281 |             min_y = y
282 |             top_most = coords[i]
283 |         elif y >= max_y:
284 |             max_y = y
285 |             bottom_most = coords[i]
286 |         if x <= min_x:
287 |             min_x = x
288 |             left_most = coords[i]
289 |         elif x >= max_x:
290 |             max_x = x
291 |             right_most = coords[i]
292 |     # print(top_most, left_most, bottom_most, right_most)
293 |
294 |     slopes = []
295 |     edge_coor = [top_most, left_most, bottom_most, right_most]
296 |     for i in range(0, len(edge_coor)):
297 |         if i == len(edge_coor) - 1:
298 |             if abs((edge_coor[0][1] - edge_coor[i][1])) >= 10:
299 |                 angle = (
300 |                     (
301 |                         (edge_coor[0][0] - edge_coor[i][0])
302 |                         / (edge_coor[0][1] - edge_coor[i][1])
303 |                     )
304 |                     * 180
305 |                 ) / 3.14
306 |                 slopes.append(angle)
307 |             else:
308 |                 slopes.append(0.0)
309 |         else:
310 |             if abs((edge_coor[i + 1][1] - edge_coor[i][1])) >= 10:
311 |                 angle = (
312 |                     (
313 |                         (edge_coor[i + 1][0] - edge_coor[i][0])
314 |                         / (edge_coor[i + 1][1] - edge_coor[i][1])
315 |                     )
316 |                     * 180
317 |                 ) / 3.14
318 |                 slopes.append(angle)
319 |             else:
320 |                 slopes.append(0.0)
321 |         # img = cv2.circle(thresh, (edge_coor[i][1], edge_coor[i][0]), 5, (255, 0, 0), 2)
322 |
323 |     slopes = np.asarray(slopes)
324 |     if len(np.where(slopes == 0.0)[0]) >= 2:
325 |         # two or more flat edges: don't rotate
326 |         return None
327 |     else:
328 |         # rotate
329 |         neg_slope = (slopes[0] + slopes[2]) / 2
330 |         pos_slope = (slopes[1] + slopes[3]) / 2
331 |         # print(pos_slope, neg_slope)
332 |         new_pos_slope = pos_slope
333 |         new_neg_slope = neg_slope
334 |         if pos_slope > 90:
335 |             if pos_slope < 180:
336 |                 new_pos_slope = 180 - pos_slope
337 |             else:
338 |                 new_pos_slope = pos_slope - ((pos_slope // 180) * 180)
339 |         # print(new_pos_slope)
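        # Fold both averaged slopes back into [-90, 90] so the smaller
        # absolute rotation can be chosen below.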
340 |         if neg_slope < -90:
341 |             new_neg_slope = 180 + neg_slope
342 |         # print(new_pos_slope, new_neg_slope)
343 |         if new_pos_slope <= new_neg_slope:
344 |             fin_angle = pos_slope
345 |         else:
346 |             fin_angle = neg_slope
347 |
348 |         if fin_angle < -90:
349 |             rot_angle = 180 + fin_angle
350 |         elif fin_angle > 90:
351 |             rot_angle = -(180 - fin_angle)
352 |         else:
353 |             # covers -90 <= fin_angle <= 90 (0 and the axis angles included),
354 |             # so every branch assigns rot_angle
355 |             rot_angle = fin_angle
356 |         return rot_angle
357 |
358 |
359 | def estimate_skew(image):
360 |     try:
361 |         osd = pytesseract.image_to_osd(image)
362 |         angle = float(re.search(r"(?<=Rotate: )\d+", osd).group(0))
363 |         if angle == 0:
364 |             # fin_image = rotate(image_gray, angle)
365 |             edges = auto_canny(image)
366 |             # print(edges.shape)
367 |             # print("edges found: ", edges)
368 |             lines = cv2.HoughLines(edges, 1, np.pi / 270, 400)
369 |             # print("lines found: ", lines)
370 |             if lines is not None:
371 |                 new = edges.copy()
372 |                 thetas = []
373 |                 for line in lines:
374 |                     for rho, theta in line:
375 |                         a = np.cos(theta)
376 |                         b = np.sin(theta)
377 |                         x0 = a * rho
378 |                         y0 = b * rho
379 |                         x1 = int(x0 + 1000 * (-b))
380 |                         y1 = int(y0 + 1000 * (a))
381 |                         x2 = int(x0 - 1000 * (-b))
382 |                         y2 = int(y0 - 1000 * (a))
383 |                         if theta > np.pi / 3 and theta < np.pi * 2 / 3:
384 |                             thetas.append(theta)
385 |                         new = cv2.line(new, (x1, y1), (x2, y2), (255, 255, 255), 1)
386 |
387 |                 theta_mean = np.mean(thetas)
388 |                 theta = -(90 - (rad_to_deg(theta_mean) if len(thetas) > 0 else 0))
389 |             else:
390 |                 # theta = angle_calculation(image)
391 |                 theta = 0.0
392 |         else:
393 |             theta = angle
394 |         return theta
395 |     except Exception as e:
396 |         print("theta_error") # (f"Error: {e}", exc_info=True)
397 |
398 |
399 | def process_skewed_crop(image):
400 |     try:
401 |         theta = estimate_skew(image)
402 |         # print(theta)
403 |         # ret, thresh = cv2.threshold(image, 0, 127, cv2.THRESH_OTSU)
404 |         # print(thresh)
405 |         if theta is None:
406 |             rotated = image
407 |         elif (theta % 90) != 0:
408 |             rotated = rotate(image, theta)
409 |         else:
410 |             rotated = imutils.rotate_bound(image, theta)
411 |         # print(rotated)
412 |         return rotated, theta
413 |     except Exception as e:
414 |         print("skew_Error") # (f"Error: {e}", exc_info=True)
415 |
416 |
417 | def preprocess_image(file_path: str):
418 |     try:
419 |         gray_page = cv2.imread(file_path, 0)
420 |         process_page = PagePreprocess(gray_page)
421 |         _ = process_page.crop()
422 |         deskewed_page = process_page.deskew()
423 |         # cv2.imwrite(file_path, deskewed_page)
424 |         return deskewed_page
425 |     except Exception as e:
426 |         print("process_image_error") # (f"Error: {e}", exc_info=True)
427 |
428 |
429 | def preprocess_image_file(img):
430 |     try:
431 |         # converted_image = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR)
432 |         gray_page = cv2.cvtColor(np.array(img), cv2.COLOR_BGR2GRAY)
433 |         # gray_page = cv2.cvtColor(gray_page, cv2.COLOR_BGR2RGB)
434 |         process_page = PagePreprocess(gray_page)
435 |         _ = process_page.crop()
436 |         deskewed_page = process_page.deskew()
437 |         return deskewed_page
438 |     except Exception as e:
439 |         print("error") # (f"Error: {e}", exc_info=True)
440 |
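A quick usage sketch for this module, assuming it is importable as pre_processing and that OpenCV, SciPy, imutils, and pytesseract are installed; the file names below are illustrative:

import cv2
from pre_processing import preprocess_image

# Crop and deskew a scanned page before OCR. The pipeline swallows its own
# exceptions and prints a tag instead, so a None return is possible.
deskewed_page = preprocess_image("sample_page.jpg")
if deskewed_page is not None:
    cv2.imwrite("sample_page_clean.jpg", deskewed_page)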
--------------------------------------------------------------------------------
/2_level_doc_classification/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from hybrid_v2 import train_hybrid_v2
4 | from pre_process_text import (
5 |     pdf_to_images,
6 |     read_text_from_pages_v2,
7 | )
8 |
9 |
10 | def process_multi_level(
11 |     dataset_path: str,
12 |     save_dir: str,
13 |     pdf_check: bool,
14 |     artifact_name: str,
15 |     num_words_to_read: int,
16 | ):
17 |     updated_dataset_path = os.path.join(save_dir, artifact_name, "dataset")
18 |     os.makedirs(updated_dataset_path, exist_ok=True)
19 |     for master_document_type in sorted(os.listdir(dataset_path)):
20 |         if master_document_type != ".DS_Store":
21 |             document_folder_path = os.path.join(dataset_path, master_document_type)
22 |             updated_master_document_type_folder_path = os.path.join(
23 |                 updated_dataset_path, master_document_type
24 |             )
25 |             os.makedirs(updated_master_document_type_folder_path, exist_ok=True)
26 |             for document_type in sorted(os.listdir(document_folder_path)):
27 |                 if document_type != ".DS_Store":
28 |                     folder_path = os.path.join(document_folder_path, document_type)
29 |                     updated_document_type_folder_path = os.path.join(
30 |                         updated_master_document_type_folder_path, document_type
31 |                     )
32 |                     os.makedirs(updated_document_type_folder_path, exist_ok=True)
33 |                     for documents in sorted(os.listdir(folder_path)):
34 |                         if documents != ".DS_Store":
35 |                             document_path = os.path.join(folder_path, documents)
36 |                             if pdf_check:
37 |                                 # Perform conversion and store the images in a temp folder
38 |                                 pdf_to_images(
39 |                                     full_path_pdf=document_path,
40 |                                     converted_images_path=updated_document_type_folder_path,
41 |                                     meta_name=artifact_name,
42 |                                 )
43 |     if pdf_check:
44 |         images_data_path = os.path.join(save_dir, artifact_name)
45 |     else:
46 |         images_data_path = dataset_path
47 |     master_data_path = read_text_from_pages_v2(
48 |         complete_folder_path=images_data_path,
49 |         path_to_save_essential_data=save_dir,
50 |         meta_name=artifact_name,
51 |         num_of_words=num_words_to_read,
52 |     )
53 |     return master_data_path
54 |
55 |
56 | def multi_level(args):
57 |     all_data_path = process_multi_level(
58 |         dataset_path=args.data_path,
59 |         save_dir=args.file_path,
60 |         pdf_check=bool(args.pdfs),
61 |         artifact_name=args.art_name,
62 |         num_words_to_read=args.num_of_words,
63 |     )
64 |     train_hybrid_v2(
65 |         text_plus_file_path=all_data_path,
66 |         batch_size=int(args.batch_size),
67 |         epochs=int(args.epochs),
68 |         image_shape=int(args.img_shape),
69 |         max_words=int(args.num_of_words),
70 |         artifact_name=args.art_name,
71 |         save_dir_path=args.file_path,
72 |         trained_model_path=args.model_path,
73 |         experiment_name=args.experiment_name,
74 |     )
75 |
76 |
77 | parser = argparse.ArgumentParser()
78 | parser.add_argument("-dp", "--data_path", help="File path of the dataset")
79 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts")
80 | parser.add_argument("-a", "--art_name", help="Artifacts name")
81 | parser.add_argument("-p", "--pdfs", default=False, help="Dataset type")
82 | parser.add_argument("-n", "--num_of_words", default=10, help="No of words to read")
83 | parser.add_argument("-b", "--batch_size", default=8, help="Batch size for training")
84 | parser.add_argument("-e", "--epochs", default=3, help="Number of epochs")
85 | parser.add_argument("-is", "--img_shape", default=100, help="One dimension of image")
86 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model")
87 | parser.add_argument(
88 |     "-exp",
89 |     "--experiment_name",
90 |     default="multi_label_classification",
91 |     help="Name of the experiment for tracking",
92 | )
93 | args = parser.parse_args()
94 | multi_level(args=args)
95 |
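For reference, a typical invocation of this script could look like the line below; all paths and names are illustrative. Note that --pdfs is cast with bool(), so any non-empty string (even "False") enables PDF conversion; omit the flag for image datasets.

python train.py -dp /data/raw_docs -fp /data/artifacts -a som -p True -n 10 -b 8 -e 3 -is 100 -exp multi_label_classification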
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Vaibhav Satpathy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # psAI-clOps
2 | End to End MLOps
3 |
4 | This is a workflow to ensure Model Ops is taken care of properly in any organisation
5 |
6 | To utilize this repository to its fullest and set up the required dashboard on AWS for tracking, deploying and versioning, check out the following blogs:
7 | 1. Setup MLflow on AWS: https://vaibhavsatpathy.medium.com/setup-mlflow-on-aws-ec2-94b8e473618f
8 | 2. MLOps deployment into AWS Fargate I: https://vaibhavsatpathy.medium.com/mlops-deployment-into-aws-fargate-i-bd612af5dd7a
9 | 3. 
MLOps deployment into AWS Fargate II: https://vaibhavsatpathy.medium.com/mlops-deployment-in-to-aws-fargate-ii-95321942b9e1 10 | -------------------------------------------------------------------------------- /conditional_GAN/__pycache__/bento_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/conditional_GAN/__pycache__/bento_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /conditional_GAN/artifacts/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/conditional_GAN/artifacts/saved_model.pb -------------------------------------------------------------------------------- /conditional_GAN/artifacts/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/conditional_GAN/artifacts/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /conditional_GAN/artifacts/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/conditional_GAN/artifacts/variables/variables.index -------------------------------------------------------------------------------- /conditional_GAN/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import ConditionalDigitGenerator 2 | from tensorflow.keras.models import load_model 3 | import tensorflow as tf 4 | 5 | 6 | def classifier_models(model_service, model_path: str): 7 | model_gen = load_model(model_path) 8 | tf.saved_model.save(model_gen, "artifacts/") 9 | model_gen = tf.saved_model.load("artifacts/") 10 | model_service.pack("model", model_gen) 11 | 12 | 13 | def main(): 14 | model_service = ConditionalDigitGenerator() 15 | classifier_models(model_service=model_service, model_path=generator_model_path) 16 | saved_path = model_service.save() 17 | 18 | 19 | generator_model_path = "/Users/vsatpathy/Desktop/docs/training_data/c_gan/generator.h5" 20 | main() -------------------------------------------------------------------------------- /conditional_GAN/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.adapters import JsonInput 3 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 4 | 5 | import tensorflow as tf 6 | import importlib.util 7 | import numpy as np 8 | from PIL import Image 9 | 10 | 11 | @bentoml.env(infer_pip_packages=True) 12 | @bentoml.artifacts([TensorflowSavedModelArtifact("model")]) 13 | class ConditionalDigitGenerator(bentoml.BentoService): 14 | @bentoml.api(input=JsonInput()) 15 | def generate_conditional_image(self, parsed_json): 16 | model = self.artifacts.model.signatures["serving_default"] 17 | model._num_positional_args = 2 18 | noise = np.random.normal(0, 1, (1, 100)) 19 | noise = tf.convert_to_tensor(noise, dtype=tf.float32) 20 | label = np.asarray(int(parsed_json.get("number"))).reshape(-1, 1) 21 | label = 
tf.convert_to_tensor(label, dtype=tf.int32) 22 | results = model(noise, label) 23 | generated_image = results.get("sequential")[0].numpy().reshape(28, 28) 24 | return {"digit_generated": generated_image} 25 | -------------------------------------------------------------------------------- /conditional_GAN/infer.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import load_model 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from PIL import Image 5 | 6 | 7 | def test(gen_model_path: str, i: int): 8 | gen = load_model(gen_model_path) 9 | noise = np.random.normal(0, 1, (1, 100)) 10 | label = np.random.randint(0, 10, 1).reshape(-1, 1) 11 | image = np.squeeze(gen.predict([noise, label]), axis=0) 12 | plt.imsave( 13 | "/Users/vsatpathy/Desktop/off_POCs/cycle_gan/epoch_%d_tag_%s" % (i, label[0]), 14 | image.reshape(28, 28), 15 | format="jpg", 16 | cmap="gray", 17 | ) 18 | 19 | 20 | generator_model_path = "/Users/vsatpathy/Desktop/docs/training_data/c_gan/generator.h5" 21 | test(gen_model_path=generator_model_path, i=0) 22 | -------------------------------------------------------------------------------- /conditional_GAN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import argparse 5 | import mlflow 6 | 7 | from tensorflow.keras.models import Model, Sequential 8 | from tensorflow.keras.datasets import mnist 9 | from tensorflow.keras.optimizers import Adam 10 | from tensorflow.keras import backend as K 11 | from tensorflow.keras import initializers 12 | from tensorflow.keras.layers import ( 13 | Input, 14 | multiply, 15 | Embedding, 16 | LeakyReLU, 17 | Reshape, 18 | Dense, 19 | Dropout, 20 | Flatten, 21 | Convolution2D, 22 | UpSampling2D, 23 | BatchNormalization, 24 | ) 25 | 26 | 27 | tracking_uri = ( 28 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 29 | ) 30 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 31 | 32 | 33 | def generator(): 34 | gen = Sequential() 35 | gen.add(Dense(256, input_dim=100)) 36 | gen.add(LeakyReLU(0.2)) 37 | gen.add(BatchNormalization(momentum=0.8)) 38 | gen.add(Dense(512)) 39 | gen.add(LeakyReLU(0.2)) 40 | gen.add(BatchNormalization(momentum=0.8)) 41 | gen.add(Dense(1024)) 42 | gen.add(LeakyReLU(0.2)) 43 | gen.add(BatchNormalization(momentum=0.8)) 44 | gen.add(Dense(784, activation="tanh")) 45 | # gen.summary() 46 | 47 | noise = Input(shape=(100,)) 48 | label = Input(shape=(1,), dtype="int32") 49 | label_embedding = Flatten()(Embedding(10, 100)(label)) 50 | model_input = multiply([noise, label_embedding]) 51 | image = gen(model_input) 52 | 53 | gen = Model([noise, label], image) 54 | gen.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 55 | return gen 56 | 57 | 58 | def discriminator(): 59 | disc = Sequential() 60 | disc.add(Dense(512, input_dim=784)) 61 | disc.add(LeakyReLU(0.2)) 62 | disc.add(Dropout(0.4)) 63 | disc.add(Dense(512)) 64 | disc.add(LeakyReLU(0.2)) 65 | disc.add(Dropout(0.4)) 66 | disc.add(Dense(512)) 67 | disc.add(LeakyReLU(0.2)) 68 | disc.add(Dropout(0.4)) 69 | disc.add(Dense(1, activation="sigmoid")) 70 | # disc.summary() 71 | 72 | image = Input(shape=(784,)) 73 | label = Input(shape=(1,), dtype="int32") 74 | label_embedding = Flatten()(Embedding(10, 784)(label)) 75 | model_input = multiply([image, label_embedding]) 76 | prediction = disc(model_input) 77 | 78 | disc = 
Model([image, label], prediction) 79 | disc.compile( 80 | loss="binary_crossentropy", 81 | optimizer=Adam(lr=0.0002, beta_1=0.5), 82 | metrics=["accuracy"], 83 | ) 84 | return disc 85 | 86 | 87 | def stacked_GAN(gen, disc): 88 | gan_input = Input(shape=(100,)) 89 | label = Input(shape=(1,)) 90 | x = gen([gan_input, label]) 91 | disc.trainable = False 92 | gan_out = disc([x, label]) 93 | gan_stack = Model([gan_input, label], gan_out) 94 | gan_stack.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 95 | return gan_stack 96 | 97 | 98 | def train( 99 | gen, 100 | disc, 101 | gan_stack, 102 | max_iter: int, 103 | batch_size: int, 104 | img_shape: int, 105 | file_path: str, 106 | artifact_name: str, 107 | exp_name: str, 108 | ): 109 | 110 | mlflow.set_tracking_uri(tracking_uri) 111 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 112 | try: 113 | expr_name = exp_name # create a new experiment (do not replace) 114 | mlflow.create_experiment(expr_name, s3_bucket) 115 | mlflow.set_experiment(expr_name) 116 | experiment = mlflow.get_experiment_by_name(exp_name) 117 | except: 118 | experiment = mlflow.get_experiment_by_name(exp_name) 119 | 120 | os.makedirs(os.path.join(file_path, artifact_name), exist_ok=True) 121 | mlflow.tensorflow.autolog(every_n_iter=1) 122 | with mlflow.start_run(experiment_id=experiment.experiment_id): 123 | 124 | mlflow.log_metrics( 125 | { 126 | "batch_size": batch_size, 127 | "epochs": max_iter, 128 | "image_shape": img_shape, 129 | } 130 | ) 131 | 132 | (X_train, y_train), (_, _) = mnist.load_data() 133 | X_train = (X_train.astype(np.float32) - 127.5) / 127.5 134 | X_train = X_train.reshape(60000, 784) 135 | y_train = y_train.reshape(-1, 1) 136 | 137 | valid = np.ones((batch_size, 1)) 138 | fake = np.zeros((batch_size, 1)) 139 | for i in range(max_iter): 140 | noise = np.random.normal(0, 1, (batch_size, 100)) 141 | index = np.random.randint(0, X_train.shape[0], size=batch_size) 142 | image_batch = X_train[index] 143 | label_batch = y_train[index] 144 | 145 | fake_images = gen.predict([noise, label_batch]) 146 | 147 | disc.trainable = True 148 | disc_loss_real = disc.train_on_batch([image_batch, label_batch], valid) 149 | disc_loss_fake = disc.train_on_batch([fake_images, label_batch], fake) 150 | disc_loss_final = 0.5 * np.add(disc_loss_real, disc_loss_fake) 151 | 152 | fake_labels = np.random.randint(0, 10, batch_size).reshape(-1, 1) 153 | disc.trainable = False 154 | gen_loss = gan_stack.train_on_batch([noise, fake_labels], valid) 155 | 156 | mlflow.log_metrics( 157 | {"generator_loss": gen_loss, "discriminator_loss": disc_loss_final[0]} 158 | ) 159 | 160 | print( 161 | "epoch_%d---->gen_loss:[%f]---->disc_loss:[%f]---->acc:[%f]" 162 | % (i, gen_loss, disc_loss_final[0], disc_loss_final[1] * 100) 163 | ) 164 | # if i % 100 == 0: 165 | # test(gen, i) 166 | gen.save(os.path.join(file_path, artifact_name, "generator.h5")) 167 | # disc.save(os.path.join(file_path, artifact_name, "discriminator.h5")) 168 | 169 | meta_data_path = os.path.join(file_path, artifact_name) 170 | for artifact in sorted(os.listdir(meta_data_path)): 171 | if artifact != ".DS_Store": 172 | artifact_path = os.path.join(meta_data_path, artifact) 173 | if ( 174 | os.path.isfile(artifact_path) 175 | and artifact_path.split(".")[-1] != "h5" 176 | ): 177 | print(f"artifact to be uploaded is: {artifact}") 178 | mlflow.log_artifact(local_path=artifact_path) 179 | 180 | artifact_uri = mlflow.get_artifact_uri() 181 | print(artifact_uri) 182 | mlflow.end_run() 183 | 184 
|
185 | parser = argparse.ArgumentParser()
186 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts")
187 | parser.add_argument("-a", "--art_name", help="Artifacts name")
188 | parser.add_argument("-b", "--batch_size", default=32, help="Batch size for training")
189 | parser.add_argument("-e", "--epochs", default=20000, help="Number of epochs")
190 | parser.add_argument("-is", "--img_shape", default=784, help="One dimension of image")
191 | parser.add_argument(
192 |     "-exp",
193 |     "--experiment_name",
194 |     default="conditional_gan",
195 |     help="Name of the experiment for tracking",
196 | )
197 | args = parser.parse_args()
198 | # Build the networks once so the generator and discriminator trained through
199 | # the stacked GAN are the same instances that get saved afterwards.
200 | gen_model = generator()
201 | disc_model = discriminator()
202 | train(
203 |     gen=gen_model,
204 |     disc=disc_model,
205 |     gan_stack=stacked_GAN(gen=gen_model, disc=disc_model),
206 |     max_iter=int(args.epochs),
207 |     batch_size=int(args.batch_size),
208 |     img_shape=int(args.img_shape),
209 |     file_path=args.file_path,
210 |     artifact_name=args.art_name,
211 |     exp_name=args.experiment_name,
212 | )
213 |
--------------------------------------------------------------------------------
/document_classification/__pycache__/hybrid_v1.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/document_classification/__pycache__/hybrid_v1.cpython-38.pyc
--------------------------------------------------------------------------------
/document_classification/__pycache__/pre_process_text.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/document_classification/__pycache__/pre_process_text.cpython-38.pyc
--------------------------------------------------------------------------------
/document_classification/__pycache__/pre_processing.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/document_classification/__pycache__/pre_processing.cpython-38.pyc
--------------------------------------------------------------------------------
/document_classification/bento_package.py:
--------------------------------------------------------------------------------
1 | from bento_predictor import ModelZoo
2 | import tensorflow as tf
3 | from tensorflow.keras.models import load_model
4 | from tensorflow.keras.preprocessing.text import Tokenizer
5 | import spacy
6 | import json
7 |
8 |
9 | def mcr_models(
10 |     model_zoo, tf_model_path_mcr: str, text_file_path_mcr: str, labels_path_mcr: str
11 | ):
12 |     model_cnn = load_model(tf_model_path_mcr)
13 |     tf.saved_model.save(model_cnn, "artifacts/")
14 |     model_cnn = tf.saved_model.load("artifacts/")
15 |     model_zoo.pack("mcr_model", model_cnn)
16 |
17 |     text_model = spacy.load("en_core_web_sm")
18 |     model_zoo.pack("mcr_spacy_model", text_model)
19 |
20 |     tokenizer = Tokenizer()
21 |     with open(text_file_path_mcr, "r") as f:
22 |         bow = f.read()
23 |     tokenizer.fit_on_texts(bow.split("####"))
24 |     model_zoo.pack("mcr_tokenizer", tokenizer)
25 |
26 |     with open(labels_path_mcr, "r") as f:
27 |         labels_mcr = json.load(f)
28 |     model_zoo.pack("mcr_labels", labels_mcr)
29 |
30 |
31 | def main():
32 |     model_zoo = ModelZoo()
33 |     mcr_models(
34 |         model_zoo=model_zoo,
35 |         tf_model_path_mcr=tf_model_path_mcr,
36 |         text_file_path_mcr=text_file_path_mcr,
37 |
labels_path_mcr=labels_path_mcr, 38 | ) 39 | saved_path = model_zoo.save() 40 | 41 | 42 | tf_model_path_mcr = ( 43 | "/Users/vsatpathy/Desktop/docs/training_data/mcr/document_classifier.h5" 44 | ) 45 | text_file_path_mcr = ( 46 | "/Users/vsatpathy/Desktop/docs/training_data/mcr/file_and_text_mcr.txt" 47 | ) 48 | labels_path_mcr = "/Users/vsatpathy/Desktop/docs/training_data/mcr/rev_labels_mcr.json" 49 | main() 50 | -------------------------------------------------------------------------------- /document_classification/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.types import FileLike 3 | from bentoml.adapters import JsonInput, FileInput, MultiFileInput 4 | from bentoml.frameworks.spacy import SpacyModelArtifact 5 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 6 | from bentoml.service.artifacts.common import ( 7 | JSONArtifact, 8 | PickleArtifact, 9 | ) 10 | 11 | import tensorflow as tf 12 | from tensorflow.keras.preprocessing.image import load_img 13 | import numpy as np 14 | from pytesseract import image_to_string 15 | import re 16 | from PIL import Image 17 | from typing import List 18 | 19 | 20 | @bentoml.env(infer_pip_packages=True) 21 | @bentoml.artifacts( 22 | [ 23 | TensorflowSavedModelArtifact("mcr_model"), 24 | SpacyModelArtifact("mcr_spacy_model"), 25 | PickleArtifact("mcr_tokenizer"), 26 | JSONArtifact("mcr_labels"), 27 | ] 28 | ) 29 | class ModelZoo(bentoml.BentoService): 30 | def helper(self, text): 31 | dummy = [] 32 | for word in text: 33 | dummy.append(str(word)) 34 | final = " ".join(dummy) 35 | return final 36 | 37 | def preprocess_spacy(self, spacy_model, text, num_of_words: int): 38 | text = str(text) 39 | text = text.split(" ") 40 | text = self.helper(text) 41 | text = str(text.lower()) 42 | # Remove all the special characters 43 | text = re.sub(r"\W", " ", text) 44 | text = re.sub(r"[^a-zA-Z ]+", "", text) 45 | # remove all single characters 46 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 47 | # Remove single characters from the start 48 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 49 | # Substituting multiple spaces with single space 50 | text = re.sub(r"\s+", " ", text, flags=re.I) 51 | # text = self.artifacts.mcr_spacy_model(text) 52 | text = spacy_model(text) 53 | filtered = [token.lemma_ for token in text if token.is_stop == False] 54 | text = " ".join(filtered[: num_of_words * 2]) 55 | text = text.strip().split(" ") 56 | text = " ".join(text[:num_of_words]) 57 | return text 58 | 59 | def tokenize_sentence(self, sentence, tokenizer, maximum_word_length): 60 | updated_sentence = sentence.split(" ") 61 | tok_sent = [] 62 | for word in updated_sentence: 63 | if word in tokenizer.word_index: 64 | tok_sent.append(tokenizer.word_index[word]) 65 | else: 66 | tok_sent.append(0) 67 | if len(tok_sent) != maximum_word_length: 68 | delta = maximum_word_length - len(tok_sent) 69 | for i in range(delta): 70 | tok_sent.append(0) 71 | return tok_sent 72 | 73 | def pre_process_image(self, image_file): 74 | ocr_image = np.asarray(Image.open(image_file)) 75 | image = np.asarray( 76 | Image.open(image_file).convert(mode="RGB").resize((100, 100)) 77 | ) 78 | image = np.divide(image, 255.0) 79 | image = np.asarray([image]).astype("float32") 80 | return ocr_image, image 81 | 82 | def pre_process_mcr(self, file): 83 | ocr_image, image = self.pre_process_image(image_file=file) 84 | doc_text = image_to_string(ocr_image) 85 | doc_text_processed = self.preprocess_spacy( 86 
| spacy_model=self.artifacts.mcr_spacy_model, text=doc_text, num_of_words=10 87 | ) 88 | fin_text = self.tokenize_sentence( 89 | sentence=doc_text_processed, 90 | tokenizer=self.artifacts.mcr_tokenizer, 91 | maximum_word_length=10, 92 | ) 93 | return image, np.asarray([fin_text]).astype("float32") 94 | 95 | @bentoml.api(input=FileInput()) 96 | def predict_document_labels_mcr(self, file_stream): 97 | image, text = self.pre_process_mcr(file=file_stream) 98 | model = self.artifacts.mcr_model.signatures["serving_default"] 99 | model._num_positional_args = 2 100 | results = model(tf.constant(text), tf.constant(image)) 101 | conv_results = results.get("dense_1")[0].numpy() 102 | document_label = self.artifacts.mcr_labels[str(np.argmax(conv_results))] 103 | return {"document_type": document_label} 104 | -------------------------------------------------------------------------------- /document_classification/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | web: 4 | image: 142339138776.dkr.ecr.us-east-2.amazonaws.com/docuedge-model-zoo:latest 5 | ports: 6 | - "5000:5000" 7 | logging: 8 | driver: awslogs 9 | options: 10 | awslogs-group: docuedge-modelserver-ecs 11 | awslogs-region: us-east-2 12 | awslogs-stream-prefix: web 13 | volumes: 14 | - /app/temp 15 | -------------------------------------------------------------------------------- /document_classification/ecs-params.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | task_definition: 3 | task_execution_role: ecsTaskExecutionRoleBento 4 | ecs_network_mode: awsvpc 5 | task_size: 6 | mem_limit: 8GB 7 | cpu_limit: 4096 8 | efs_volumes: 9 | - name: docuedgedev-efs 10 | filesystem_id: fs-4717c93f 11 | root_directory: /smartbox-config 12 | run_params: 13 | network_configuration: 14 | awsvpc_configuration: 15 | subnets: 16 | - subnet-00e7bff093931a167 17 | - subnet-0345b051535c9625d 18 | security_groups: 19 | - sg-09b7e06cb8b13167d 20 | - sg-0601a52d4ea28af05 21 | assign_public_ip: ENABLED 22 | -------------------------------------------------------------------------------- /document_classification/hybrid_v1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from tensorflow.keras.preprocessing.image import load_img 4 | from tensorflow.keras.preprocessing.text import Tokenizer 5 | from tensorflow.keras.layers import ( 6 | Input, 7 | Conv2D, 8 | Dense, 9 | Flatten, 10 | Embedding, 11 | Concatenate, 12 | GlobalMaxPool1D, 13 | ) 14 | from tensorflow.keras.models import Model, load_model 15 | import os 16 | import json 17 | import mlflow 18 | import mlflow.tensorflow 19 | 20 | 21 | tracking_uri = ( 22 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 23 | ) 24 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 25 | 26 | 27 | def read_data(path): 28 | bow = open(path, "r") 29 | data = bow.readlines() 30 | all_data_paths = [] 31 | all_texts = [] 32 | y_labels = {} 33 | for line in data: 34 | line_data = line.split("####") 35 | all_data_paths.append(line_data[0]) 36 | all_texts.append(line_data[-1][:-1]) 37 | label = line_data[0].split("/")[-2] 38 | if label not in y_labels: 39 | y_labels[label] = len(y_labels) 40 | 41 | rev_labels = {} 42 | for key, val in y_labels.items(): 43 | rev_labels[val] = key 44 | 45 | return all_data_paths, y_labels, rev_labels, all_texts 46 | 47 | 48 | def 
tokenize_sentence(sentence, tokenizer, maximum_word_length): 49 | updated_sentence = sentence.split(" ") 50 | tok_sent = [] 51 | for word in updated_sentence: 52 | if word in tokenizer.word_index: 53 | tok_sent.append(tokenizer.word_index[word]) 54 | else: 55 | tok_sent.append(0) 56 | if len(tok_sent) != maximum_word_length: 57 | delta = maximum_word_length - len(tok_sent) 58 | for i in range(delta): 59 | tok_sent.append(0) 60 | return tok_sent 61 | 62 | 63 | def data_loader_text( 64 | bs, data, y_lab, tokenizer, text_data, image_input_shape, max_word_length 65 | ): 66 | while True: 67 | images = [] 68 | labels = [] 69 | texts = [] 70 | while len(images) < bs: 71 | indice = random.randint(0, len(data) - 1) 72 | target = data[indice].split("/")[-2] 73 | labels.append(y_lab[target]) 74 | 75 | test_img = np.asarray(load_img(data[indice], target_size=image_input_shape)) 76 | img = np.divide(test_img, 255.0) 77 | images.append(img) 78 | 79 | tok_sen = tokenize_sentence( 80 | text_data[indice], tokenizer, maximum_word_length=max_word_length 81 | ) 82 | texts.append(tok_sen) 83 | yield [np.asarray(images), np.asarray(texts)], np.asarray(labels) 84 | 85 | 86 | def model_arc(y_labels, tokenizer, text_model_inp_shape, image_inp_shape): 87 | inp_layer_texts = Input(shape=text_model_inp_shape) 88 | inp_layer_images = Input(shape=image_inp_shape) 89 | 90 | embedding_layer = Embedding( 91 | input_dim=len(tokenizer.word_index) + 1, 92 | output_dim=64, 93 | input_length=text_model_inp_shape, 94 | trainable=True, 95 | )(inp_layer_texts) 96 | pooling_layer = GlobalMaxPool1D()(embedding_layer) 97 | dense_layer = Dense(units=64, activation="relu")(pooling_layer) 98 | # lstm_layer = Bidirectional(LSTM(units=32))(embedding_layer) 99 | 100 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 101 | inp_layer_images 102 | ) 103 | flatten_layer = Flatten()(conv_layer) 104 | 105 | concat_layer = Concatenate()([flatten_layer, dense_layer]) 106 | out_layer = Dense(len(y_labels), activation="softmax")(concat_layer) 107 | 108 | model = Model([inp_layer_images, inp_layer_texts], out_layer) 109 | model.compile( 110 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 111 | ) 112 | return model 113 | 114 | 115 | def train_hybrid_v1( 116 | text_plus_file_path: str, 117 | batch_size: int, 118 | epochs: int, 119 | image_shape: int, 120 | max_words: int, 121 | artifact_name: str, 122 | save_dir_path: str, 123 | trained_model_path: str, 124 | experiment_name: str, 125 | ): 126 | 127 | mlflow.set_tracking_uri(tracking_uri) 128 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 129 | try: 130 | expr_name = experiment_name # create a new experiment (do not replace) 131 | mlflow.create_experiment(expr_name, s3_bucket) 132 | mlflow.set_experiment(expr_name) 133 | experiment = mlflow.get_experiment_by_name(experiment_name) 134 | except: 135 | experiment = mlflow.get_experiment_by_name(experiment_name) 136 | 137 | all_imgs_path, y_labels, rev_labels, all_text = read_data(path=text_plus_file_path) 138 | num_train_img = len(all_imgs_path) 139 | 140 | with open( 141 | os.path.join(save_dir_path, artifact_name, f"rev_labels_{artifact_name}.json"), 142 | "w+", 143 | ) as tar: 144 | json.dump(rev_labels, tar) 145 | 146 | print("target_encodings: ", y_labels) 147 | print("Number of training images: ", num_train_img) 148 | 149 | bow = open(text_plus_file_path, "r") 150 | tokenizer = Tokenizer() 151 | tokenizer.fit_on_texts(bow.read().split("####")) 152 | 153 | train_gen = 
data_loader_text( 154 | tokenizer=tokenizer, 155 | y_lab=y_labels, 156 | data=all_imgs_path, 157 | text_data=all_text, 158 | bs=batch_size, 159 | image_input_shape=(image_shape, image_shape, 3), 160 | max_word_length=max_words, 161 | ) 162 | if os.path.isfile(trained_model_path): 163 | model = load_model(trained_model_path) 164 | else: 165 | model = model_arc( 166 | y_labels=y_labels, 167 | tokenizer=tokenizer, 168 | text_model_inp_shape=(max_words,), 169 | image_inp_shape=(image_shape, image_shape, 3), 170 | ) 171 | mlflow.tensorflow.autolog(every_n_iter=1) 172 | with mlflow.start_run(experiment_id=experiment.experiment_id): 173 | mlflow.log_metrics( 174 | { 175 | "batch_size": batch_size, 176 | "epochs": epochs, 177 | "image_shape": image_shape, 178 | "max_words": max_words, 179 | } 180 | ) 181 | history = model.fit( 182 | x=train_gen, 183 | steps_per_epoch=num_train_img // batch_size, 184 | epochs=epochs, 185 | ) 186 | model.save( 187 | filepath=os.path.join( 188 | save_dir_path, artifact_name, "document_classifier.h5" 189 | ) 190 | ) 191 | meta_data_path = os.path.join(save_dir_path, artifact_name) 192 | for artifact in sorted(os.listdir(meta_data_path)): 193 | if artifact != ".DS_Store": 194 | artifact_path = os.path.join(meta_data_path, artifact) 195 | if ( 196 | os.path.isfile(artifact_path) 197 | and artifact_path.split(".")[-1] != "h5" 198 | ): 199 | print(f"artifact to be uploaded is: {artifact}") 200 | mlflow.log_artifact(local_path=artifact_path) 201 | 202 | artifact_uri = mlflow.get_artifact_uri() 203 | print(artifact_uri) 204 | mlflow.end_run() 205 | -------------------------------------------------------------------------------- /document_classification/pre_process_text.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import re 3 | from pdf2image import convert_from_path 4 | import os 5 | from tqdm import tqdm 6 | import pre_processing 7 | from PIL import Image 8 | import pytesseract 9 | 10 | nlp = spacy.load("en_core_web_sm") 11 | 12 | 13 | def helper(text): 14 | dummy = [] 15 | for word in text: 16 | dummy.append(str(word)) 17 | final = " ".join(dummy) 18 | return final 19 | 20 | 21 | def preprocess_spacy(text, num_of_words: int): 22 | text = str(text) 23 | text = text.split(" ") 24 | text = helper(text) 25 | text = str(text.lower()) 26 | # Remove all the special characters 27 | text = re.sub(r"\W", " ", text) 28 | text = re.sub(r"[^a-zA-Z ]+", "", text) 29 | # remove all single characters 30 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 31 | # Remove single characters from the start 32 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 33 | # Substituting multiple spaces with single space 34 | text = re.sub(r"\s+", " ", text, flags=re.I) 35 | text = nlp(text) 36 | filtered = [token.lemma_ for token in text if token.is_stop == False] 37 | text = " ".join(filtered[: num_of_words * 2]) 38 | text = text.strip().split(" ") 39 | text = " ".join(text[:num_of_words]) 40 | return text 41 | 42 | 43 | def read_text_from_pages( 44 | complete_folder_path: str, 45 | path_to_save_essential_data: str, 46 | meta_name: str, 47 | num_of_words: int, 48 | ): 49 | final_path_for_data = os.path.join( 50 | path_to_save_essential_data, meta_name, f"file_and_text_{meta_name}.txt" 51 | ) 52 | if os.path.isfile(final_path_for_data): 53 | data = open(final_path_for_data, "r").read() 54 | else: 55 | data = "null" 56 | print("#### Reading pages ####") 57 | doc_image_types = sorted(os.listdir(complete_folder_path)) 58 | text_of_all_pages = [] 59 | for 
doc_image_type in doc_image_types: 60 | if doc_image_type != ".DS_Store": 61 | print("DOCUMENT TYPE: ", doc_image_type) 62 | complete_doc_image_path = os.path.join(complete_folder_path, doc_image_type) 63 | pages = sorted(os.listdir(complete_doc_image_path)) 64 | for page in tqdm(pages): 65 | if page != ".DS_Store": 66 | page_path = os.path.join(complete_doc_image_path, page) 67 | if page_path not in data: 68 | document_page = Image.open(page_path) 69 | document_text = pytesseract.image_to_string(document_page) 70 | document_page.close() 71 | essential_file_path_and_text = ( 72 | page_path 73 | + "####" 74 | + preprocess_spacy(document_text, num_of_words=num_of_words) 75 | + "\n" 76 | ) 77 | text_of_all_pages.append(essential_file_path_and_text) 78 | 79 | if os.path.isfile(final_path_for_data): 80 | all_essential_data = open(final_path_for_data, "a+") 81 | all_essential_data.writelines(text_of_all_pages) 82 | else: 83 | all_essential_data = open(final_path_for_data, "w") 84 | all_essential_data.writelines(text_of_all_pages) 85 | return final_path_for_data 86 | 87 | 88 | def pdf_to_images(full_path_pdf: str, converted_images_path: str, meta_name: str): 89 | doc = full_path_pdf.split("/")[-1] 90 | index = 0 91 | OUTPUT_PATH = converted_images_path 92 | os.makedirs(name=OUTPUT_PATH, exist_ok=True) 93 | 94 | print("Document name: ", doc) 95 | if str(doc.split(".pdf")[-2]) + "_" + str(index) + ".jpg" not in os.listdir( 96 | converted_images_path 97 | ): 98 | pil_images = convert_from_path(full_path_pdf, dpi=300) 99 | 100 | for image in tqdm(pil_images): 101 | processed_image = pre_processing.preprocess_image_file(image) 102 | try: 103 | processed_image = Image.fromarray(processed_image) 104 | processed_image.save( 105 | os.path.join(OUTPUT_PATH, str(doc.split(".pdf")[-2])) 106 | + "_" 107 | + str(index) 108 | + ".jpg", 109 | format="JPEG", 110 | subsampling=0, 111 | quality=100, 112 | ) 113 | index += 1 114 | processed_image.close() 115 | except: 116 | index += 1 117 | else: 118 | pass -------------------------------------------------------------------------------- /document_classification/pre_processing.py: -------------------------------------------------------------------------------- 1 | """IMAGE PREPROCESSING FUNCTIONS 2 | """ 3 | import cv2 4 | import numpy as np 5 | from scipy.ndimage.filters import rank_filter 6 | 7 | # from sbox.utils.sbox_logger import logger 8 | import pytesseract 9 | import re 10 | import imutils 11 | from PIL import Image 12 | 13 | # print("error") # = logger(__name__) 14 | 15 | 16 | class PagePreprocess(object): 17 | def __init__(self, im): 18 | self.err = False 19 | self.orig_im = im 20 | self.orig_shape = self.orig_im.shape 21 | self.image = im 22 | 23 | def crop(self): 24 | try: 25 | self.image, self.num_tries = process_image(self.orig_im) 26 | self.crop_shape = self.image.shape 27 | return self.image 28 | except Exception as e: 29 | print("crop_obj_Error") # (f"Error: {e}", exc_info=True) 30 | 31 | def deskew(self): 32 | try: 33 | self.image, self.theta_est = process_skewed_crop(self.image) 34 | return self.image 35 | except Exception as e: 36 | print("deskew_obj_Error") # (f"Error: {e}", exc_info=True) 37 | 38 | 39 | def auto_canny(image, sigma=0.33): 40 | try: 41 | v = np.median(image) 42 | lower = int(max(0, (1.0 - sigma) * v)) 43 | upper = int(min(255, (1.0 + sigma) * v)) 44 | edged = cv2.Canny(image, lower, upper, True) 45 | return edged 46 | except Exception as e: 47 | print("auto_canny_Error") # (f"Error: {e}", exc_info=True) 48 | 49 | 50 | def 
dilate(image, kernel, iterations): 51 | dilated_image = cv2.dilate(image, kernel, iterations=iterations) 52 | return dilated_image 53 | 54 | 55 | def downscale_image(im, max_dim=2048): 56 | try: 57 | a, b = im.shape[:2] 58 | if max(a, b) <= max_dim: 59 | return 1.0, im 60 | 61 | scale = 1.0 * max_dim / max(a, b) 62 | new_im = cv2.resize(im, (int(b * scale), int(a * scale)), cv2.INTER_AREA) 63 | return scale, new_im 64 | except Exception as e: 65 | print("error") # (f"Error: {e}", exc_info=True) 66 | 67 | 68 | def find_components(im, max_components=16): 69 | try: 70 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) 71 | dilation = dilate(im, kernel, 6) 72 | 73 | count = 21 74 | n = 0 75 | sigma = 0.000 76 | 77 | while count > max_components: 78 | n += 1 79 | sigma += 0.005 80 | result = cv2.findContours(dilation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 81 | if len(result) == 3: 82 | _, contours, hierarchy = result 83 | elif len(result) == 2: 84 | contours, hierarchy = result 85 | possible = find_likely_rectangles(contours, sigma) 86 | count = len(possible) 87 | 88 | return (dilation, possible, n) 89 | except Exception as e: 90 | print("comp_error") # (f"Error: {e}", exc_info=True) 91 | 92 | 93 | def find_likely_rectangles(contours, sigma): 94 | try: 95 | contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] 96 | possible = [] 97 | for c in contours: 98 | 99 | peri = cv2.arcLength(c, True) 100 | approx = cv2.approxPolyDP(c, sigma * peri, True) 101 | box = make_box(approx) 102 | possible.append(box) 103 | 104 | return possible 105 | except Exception as e: 106 | print("likely_rec_error") # (f"Error: {e}", exc_info=True) 107 | 108 | 109 | def make_box(poly): 110 | try: 111 | x = [] 112 | y = [] 113 | for p in poly: 114 | for point in p: 115 | x.append(point[0]) 116 | y.append(point[1]) 117 | xmax = max(x) 118 | ymax = max(y) 119 | xmin = min(x) 120 | ymin = min(y) 121 | return (xmin, ymin, xmax, ymax) 122 | except Exception as e: 123 | print("bbox_error") # (f"Error: {e}", exc_info=True) 124 | 125 | 126 | def rect_union(crop1, crop2): 127 | x11, y11, x21, y21 = crop1 128 | x12, y12, x22, y22 = crop2 129 | return min(x11, x12), min(y11, y12), max(x21, x22), max(y21, y22) 130 | 131 | 132 | def rect_area(crop): 133 | x1, y1, x2, y2 = crop 134 | return max(0, x2 - x1) * max(0, y2 - y1) 135 | 136 | 137 | def crop_image(im, rect, scale): 138 | try: 139 | xmin, ymin, xmax, ymax = rect 140 | crop = [xmin, ymin, xmax, ymax] 141 | xmin, ymin, xmax, ymax = [int(x / scale) for x in crop] 142 | if ((ymax - ymin) * (xmax - xmin)) > 0.25 * im.size: 143 | cropped = im[ymin:ymax, xmin:xmax] 144 | else: 145 | cropped = im 146 | return cropped 147 | except Exception as e: 148 | print("crop_error_1") # (f"Error: {e}", exc_info=True) 149 | 150 | 151 | def reduce_noise_raw(im): 152 | bilat = cv2.bilateralFilter(im, 4, 75, 75) 153 | blur = cv2.medianBlur(bilat, 1) 154 | return blur 155 | 156 | 157 | def reduce_noise_edges(im): 158 | try: 159 | structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) 160 | opening = cv2.morphologyEx(im, cv2.MORPH_OPEN, structuring_element) 161 | maxed_rows = rank_filter(opening, -4, size=(1, 20)) 162 | maxed_cols = rank_filter(opening, -4, size=(20, 1)) 163 | debordered = np.minimum(np.minimum(opening, maxed_rows), maxed_cols) 164 | return debordered 165 | except Exception as e: 166 | print("noise_red_Error") # (f"Error: {e}", exc_info=True) 167 | 168 | 169 | def rects_are_vertical(rect1, rect2, rect_align=2): 170 | try: 171 | xmin1, ymin1, 
xmax1, ymax1 = rect1 172 | xmin2, ymin2, xmax2, ymax2 = rect2 173 | 174 | midpoint1 = (xmin1 + xmax1) / 2 175 | midpoint2 = (xmin2 + xmax2) / 2 176 | dist = abs(midpoint1 - midpoint2) 177 | 178 | rectarea1 = rect_area(rect1) 179 | rectarea2 = rect_area(rect2) 180 | if rectarea1 > rectarea2: 181 | thres = (xmax1 - xmin1) * rect_align 182 | else: 183 | thres = (xmax2 - xmin2) * rect_align 184 | 185 | if thres > dist: 186 | align = True 187 | else: 188 | align = False 189 | return align 190 | except Exception as e: 191 | print("vert_rec_Error") # (f"Error: {e}", exc_info=True) 192 | 193 | 194 | def find_final_crop(im, rects, orig_im): 195 | try: 196 | current = None 197 | for rect in rects: 198 | if current is None: 199 | current = rect 200 | continue 201 | 202 | aligned = rects_are_vertical(current, rect) 203 | 204 | if not aligned: 205 | continue 206 | 207 | current = rect_union(current, rect) 208 | if current is not None: 209 | return current 210 | else: 211 | return (0, 0, orig_im.shape[0], orig_im.shape[1]) 212 | except Exception as e: 213 | print("crop_Error") # (f"Error: {e}", exc_info=True) 214 | 215 | 216 | def process_image(orig_im): 217 | try: 218 | scale, im = downscale_image(orig_im) 219 | 220 | blur = reduce_noise_raw(im.copy()) 221 | 222 | edges = auto_canny(blur.copy()) 223 | 224 | debordered = reduce_noise_edges(edges.copy()) 225 | 226 | dilation, rects, num_tries = find_components(debordered, 16) 227 | 228 | final_rect = find_final_crop(dilation, rects, orig_im) 229 | 230 | cropped = crop_image(orig_im, final_rect, scale) 231 | # kernel = np.ones((3, 3), np.float32) / 25 232 | # smooth2d = cv2.filter2D(cropped, -1, kernel=kernel) 233 | return (cropped, num_tries) 234 | except Exception as e: 235 | print("process") # (f"Error: {e}", exc_info=True) 236 | 237 | 238 | def rad_to_deg(theta): 239 | return theta * 180 / np.pi 240 | 241 | 242 | def rotate(image, theta): 243 | try: 244 | (h, w) = image.shape[:2] 245 | center = (w / 2, h / 2) 246 | M = cv2.getRotationMatrix2D(center, theta, 1) 247 | rotated = cv2.warpAffine( 248 | image, 249 | M, 250 | (int(w), int(h)), 251 | flags=cv2.INTER_LINEAR,  # warpAffine's fourth positional argument is dst, so the interpolation flag must be passed by keyword 252 | borderMode=cv2.BORDER_CONSTANT, 253 | borderValue=(255, 255, 255), 254 | ) 255 | return rotated 256 | except Exception as e: 257 | print("rotation_error") # (f"Error: {e}", exc_info=True) 258 | 259 | 260 | def angle_calculation(gray): 261 | gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY) 262 | gray = cv2.bitwise_not(gray) 263 | thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] 264 | 265 | coords = np.column_stack(np.where(thresh > 0)) 266 | # print(coords, coords.shape) 267 | 268 | min_y = coords[0][0] 269 | max_y = coords[-1][0] 270 | min_x = coords[0][1]  # x comes from the column index 271 | max_x = coords[-1][1] 272 | 273 | left_most = coords[0] 274 | right_most = coords[0] 275 | top_most = coords[0] 276 | bottom_most = coords[0] 277 | # print(coords[0], coords[-1]) 278 | for i in range(1, coords.shape[0]): 279 | y, x = coords[i][0], coords[i][1] 280 | if y <= min_y: 281 | min_y = y 282 | top_most = coords[i] 283 | elif y >= max_y: 284 | max_y = y 285 | bottom_most = coords[i] 286 | if x <= min_x: 287 | min_x = x 288 | left_most = coords[i] 289 | elif x >= max_x: 290 | max_x = x 291 | right_most = coords[i] 292 | # print(top_most, left_most, bottom_most, right_most) 293 | 294 | slopes = [] 295 | edge_coor = [top_most, left_most, bottom_most, right_most] 296 | for i in range(0, len(edge_coor)): 297 | if i == len(edge_coor) - 1: 298 | if abs((edge_coor[0][1] - edge_coor[i][1])) >= 10:
299 | angle = ( 300 | ( 301 | (edge_coor[0][0] - edge_coor[i][0]) 302 | / (edge_coor[0][1] - edge_coor[i][1]) 303 | ) 304 | * 180 305 | ) / 3.14  # slope scaled to degrees; a small-angle approximation of arctan 306 | slopes.append(angle) 307 | else: 308 | slopes.append(0.0) 309 | else: 310 | if abs((edge_coor[i + 1][1] - edge_coor[i][1])) >= 10: 311 | angle = ( 312 | ( 313 | (edge_coor[i + 1][0] - edge_coor[i][0]) 314 | / (edge_coor[i + 1][1] - edge_coor[i][1]) 315 | ) 316 | * 180 317 | ) / 3.14 318 | slopes.append(angle) 319 | else: 320 | slopes.append(0.0) 321 | # img = cv2.circle(thresh, (edge_coor[i][1], edge_coor[i][0]), 5, (255, 0, 0), 2) 322 | 323 | slopes = np.asarray(slopes) 324 | if len(np.where(slopes == 0.0)[0]) >= 2: 325 | # at least two flat edges: treat the page as straight, don't rotate 326 | return None 327 | else: 328 | # average the opposing edge slopes, then rotate 329 | neg_slope = (slopes[0] + slopes[2]) / 2 330 | pos_slope = (slopes[1] + slopes[3]) / 2 331 | # print(pos_slope, neg_slope) 332 | new_pos_slope = pos_slope 333 | new_neg_slope = neg_slope 334 | if pos_slope > 90: 335 | if pos_slope < 180: 336 | new_pos_slope = 180 - pos_slope 337 | else: 338 | new_pos_slope = pos_slope - ((pos_slope // 180) * 180) 339 | # print(new_pos_slope) 340 | if neg_slope < -90: 341 | new_neg_slope = 180 + neg_slope 342 | # print(new_pos_slope, new_neg_slope) 343 | if new_pos_slope <= new_neg_slope: 344 | fin_angle = pos_slope 345 | else: 346 | fin_angle = neg_slope 347 | 348 | if fin_angle < -90: 349 | rot_angle = 180 + fin_angle 350 | elif fin_angle > 90: 351 | rot_angle = -(180 - fin_angle) 352 | elif -90 < fin_angle < 0: 353 | rot_angle = fin_angle 354 | elif 0 < fin_angle < 90: 355 | rot_angle = fin_angle 356 | return rot_angle 357 | 358 | 359 | def estimate_skew(image): 360 | try: 361 | osd = pytesseract.image_to_osd(image) 362 | angle = float(re.search(r"(?<=Rotate: )\d+", osd).group(0)) 363 | if angle == 0: 364 | # fin_image = rotate(image_gray, angle) 365 | edges = auto_canny(image) 366 | # print(edges.shape) 367 | # print("edges found: ", edges) 368 | lines = cv2.HoughLines(edges, 1, np.pi / 270, 400) 369 | # print("lines found: ", lines) 370 | if lines is not None: 371 | new = edges.copy() 372 | thetas = [] 373 | for line in lines: 374 | for rho, theta in line: 375 | a = np.cos(theta) 376 | b = np.sin(theta) 377 | x0 = a * rho 378 | y0 = b * rho 379 | x1 = int(x0 + 1000 * (-b)) 380 | y1 = int(y0 + 1000 * (a)) 381 | x2 = int(x0 - 1000 * (-b)) 382 | y2 = int(y0 - 1000 * (a)) 383 | if theta > np.pi / 3 and theta < np.pi * 2 / 3: 384 | thetas.append(theta) 385 | new = cv2.line(new, (x1, y1), (x2, y2), (255, 255, 255), 1) 386 | 387 | theta_mean = np.mean(thetas) 388 | theta = -(90 - (rad_to_deg(theta_mean) if len(thetas) > 0 else 0)) 389 | else: 390 | # theta = angle_calculation(image) 391 | theta = 0.0 392 | else: 393 | theta = angle 394 | return theta 395 | except Exception as e: 396 | print("theta_error") # (f"Error: {e}", exc_info=True) 397 | 398 | 399 | def process_skewed_crop(image): 400 | try: 401 | theta = estimate_skew(image) 402 | # print(theta) 403 | # ret, thresh = cv2.threshold(image, 0, 127, cv2.THRESH_OTSU) 404 | # print(thresh) 405 | if theta is not None and (theta % 90) != 0: 406 | rotated = rotate(image, theta) 407 | elif theta is not None and (theta % 90) == 0: 408 | rotated = imutils.rotate_bound(image, theta) 409 | else: 410 | rotated = image 411 | # print(rotated) 412 | return rotated, theta 413 | except Exception as e: 414 | print("skew_Error") # (f"Error: {e}", exc_info=True) 415 | 416 |
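# How the deskew above settles on an angle, in brief: estimate_skew trusts Tesseract's
# OSD rotation first and falls back to Hough lines only when OSD reports 0, keeping the
# near-horizontal lines (pi/3 < theta < 2*pi/3) and mapping their mean angle through
# theta = -(90 - degrees(mean_theta)); process_skewed_crop then rotates with warpAffine
# for non-multiples of 90 degrees and with imutils.rotate_bound for exact multiples.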
417 | def preprocess_image(file_path: str): 418 | try: 419 | gray_page = cv2.imread(file_path, 0) 420 | process_page = PagePreprocess(gray_page) 421 | _ = process_page.crop() 422 | deskewed_page = process_page.deskew() 423 | # cv2.imwrite(file_path, deskewed_page) 424 | return deskewed_page 425 | except Exception as e: 426 | print("process_image_error") # (f"Error: {e}", exc_info=True) 427 | 428 | 429 | def preprocess_image_file(img): 430 | try: 431 | # converted_image = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR) 432 | gray_page = cv2.cvtColor(np.array(img), cv2.COLOR_BGR2GRAY) 433 | # gray_page = cv2.cvtColor(gray_page, cv2.COLOR_BGR2RGB) 434 | process_page = PagePreprocess(gray_page) 435 | _ = process_page.crop() 436 | deskewed_page = process_page.deskew() 437 | return deskewed_page 438 | except Exception as e: 439 | print("error") # (f"Error: {e}", exc_info=True) 440 | -------------------------------------------------------------------------------- /document_classification/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from hybrid_v1 import train_hybrid_v1 4 | from pre_process_text import ( 5 | pdf_to_images, 6 | read_text_from_pages, 7 | ) 8 | 9 | 10 | def process( 11 | dataset_path: str, 12 | save_dir: str, 13 | pdf_check: bool, 14 | artifact_name: str, 15 | num_words_to_read: int, 16 | ): 17 | updated_dataset_path = os.path.join(save_dir, artifact_name, "dataset") 18 | os.makedirs(updated_dataset_path, exist_ok=True) 19 | for document_type in sorted(os.listdir(dataset_path)): 20 | if document_type != ".DS_Store": 21 | folder_path = os.path.join(dataset_path, document_type) 22 | updated_document_type_folder_path = os.path.join( 23 | updated_dataset_path, document_type 24 | ) 25 | os.makedirs(updated_document_type_folder_path, exist_ok=True) 26 | for documents in sorted(os.listdir(folder_path)): 27 | if documents != ".DS_Store": 28 | document_path = os.path.join(folder_path, documents) 29 | if pdf_check: 30 | # Perform conversion and store the images in a temp folder 31 | pdf_to_images( 32 | full_path_pdf=document_path, 33 | converted_images_path=updated_document_type_folder_path, 34 | meta_name=artifact_name, 35 | ) 36 | if pdf_check: 37 | images_data_path = os.path.join(save_dir, artifact_name) 38 | else: 39 | images_data_path = dataset_path 40 | master_data_path = read_text_from_pages( 41 | complete_folder_path=images_data_path, 42 | path_to_save_essential_data=save_dir, 43 | meta_name=artifact_name, 44 | num_of_words=num_words_to_read, 45 | ) 46 | return master_data_path 47 | 48 | 49 | def single_level(args): 50 | all_data_path = process( 51 | dataset_path=args.data_path, 52 | save_dir=args.file_path, 53 | pdf_check=bool(args.pdfs), 54 | artifact_name=args.art_name, 55 | num_words_to_read=int(args.num_of_words), 56 | ) 57 | train_hybrid_v1( 58 | text_plus_file_path=all_data_path, 59 | batch_size=int(args.batch_size), 60 | epochs=int(args.epochs), 61 | image_shape=int(args.img_shape), 62 | max_words=int(args.num_of_words), 63 | artifact_name=args.art_name, 64 | save_dir_path=args.file_path, 65 | trained_model_path=args.model_path, 66 | experiment_name=args.experiment_name, 67 | ) 68 | 69 | 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument("-dp", "--data_path", help="File path of the dataset") 72 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts") 73 | parser.add_argument("-a", "--art_name", help="Artifacts name") 74 | parser.add_argument("-p", "--pdfs", action="store_true", help="Set this flag when the dataset consists of PDFs")  # default=False with a CLI-supplied string was always truthy 75 | parser.add_argument("-n", "--num_of_words", default=10, help="No of words to read") 76 | parser.add_argument("-b", "--batch_size", default=8, help="Batch size for training") 77 | parser.add_argument("-e", "--epochs", default=3, help="Number of epochs") 78 | parser.add_argument("-is", "--img_shape", default=100, help="One dimension of image") 79 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model") 80 | parser.add_argument( 81 | "-exp", 82 | "--experiment_name", 83 | default="document_classification", 84 | help="Name of the experiment for tracking", 85 | ) 86 | args = parser.parse_args() 87 | single_level(args=args) # For single level document classification 88 |
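A typical invocation of this training entry point (a sketch; the dataset and artifact paths and the artifact name are illustrative, not taken from the repo, and with the store_true flag above passing -p switches on the PDF-to-image conversion step):

python train.py -dp /path/to/dataset -fp /path/to/artifacts -a mcr -p -n 10 -b 8 -e 3 -is 100 -exp document_classification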
parser.add_argument("-p", "--pdfs", default=False, help="Dataset type") 75 | parser.add_argument("-n", "--num_of_words", default=10, help="No of words to read") 76 | parser.add_argument("-b", "--batch_size", default=8, help="Batch size for training") 77 | parser.add_argument("-e", "--epochs", default=3, help="Number of epochs") 78 | parser.add_argument("-is", "--img_shape", default=100, help="One dimension of image") 79 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model") 80 | parser.add_argument( 81 | "-exp", 82 | "--experiment_name", 83 | default="document_classification", 84 | help="Name of the experiment for tracking", 85 | ) 86 | args = parser.parse_args() 87 | single_level(args=args) # For single level document classification 88 | -------------------------------------------------------------------------------- /image_classifier/__pycache__/bento_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/image_classifier/__pycache__/bento_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /image_classifier/artifacts/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/image_classifier/artifacts/saved_model.pb -------------------------------------------------------------------------------- /image_classifier/artifacts/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/image_classifier/artifacts/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /image_classifier/artifacts/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/image_classifier/artifacts/variables/variables.index -------------------------------------------------------------------------------- /image_classifier/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import ImageClassifier 2 | from tensorflow.keras.models import load_model 3 | import tensorflow as tf 4 | import json 5 | 6 | 7 | def classifier_models(model_service, model_path: str, labels_path: str): 8 | model_cnn = load_model(model_path) 9 | tf.saved_model.save(model_cnn, "artifacts/") 10 | model_cnn = tf.saved_model.load("artifacts/") 11 | model_service.pack("model", model_cnn) 12 | 13 | with open(labels_path, "r") as f: 14 | labels = json.load(f) 15 | model_service.pack("labels", labels) 16 | 17 | 18 | def main(): 19 | model_service = ImageClassifier() 20 | classifier_models( 21 | model_service=model_service, model_path=model_path, labels_path=labels_path 22 | ) 23 | saved_path = model_service.save() 24 | 25 | 26 | model_path = "/Users/vsatpathy/Desktop/docs/training_data/intel/image_classifier.h5" 27 | labels_path = "/Users/vsatpathy/Desktop/docs/training_data/intel/rev_labels_intel.json" 28 | main() -------------------------------------------------------------------------------- 
/image_classifier/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.adapters import FileInput 3 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 4 | from bentoml.service.artifacts.common import JSONArtifact 5 | 6 | import tensorflow as tf 7 | from tensorflow.keras.preprocessing.image import load_img 8 | import importlib.util 9 | import numpy as np 10 | from PIL import Image 11 | 12 | 13 | @bentoml.env(infer_pip_packages=True) 14 | @bentoml.artifacts([TensorflowSavedModelArtifact("model"), JSONArtifact("labels")]) 15 | class ImageClassifier(bentoml.BentoService): 16 | def pre_process_image(self, image_file): 17 | image = np.asarray( 18 | Image.open(image_file).convert(mode="RGB").resize((100, 100)) 19 | ) 20 | image = np.divide(image, 255.0) 21 | image = np.asarray([image]).astype("float32") 22 | return image 23 | 24 | @bentoml.api(input=FileInput()) 25 | def predict_image(self, file_stream): 26 | image = self.pre_process_image(image_file=file_stream) 27 | model = self.artifacts.model.signatures["serving_default"] 28 | model._num_positional_args = 1 29 | results = model(tf.constant(image)) 30 | print(results) 31 | conv_results = results.get("dense")[0].numpy() 32 | label = self.artifacts.labels[str(np.argmax(conv_results))] 33 | return {"label": label} 34 | -------------------------------------------------------------------------------- /image_classifier/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | web: 4 | image: 142339138776.dkr.ecr.us-east-2.amazonaws.com/docuedge-model-zoo:latest 5 | ports: 6 | - "5000:5000" 7 | logging: 8 | driver: awslogs 9 | options: 10 | awslogs-group: docuedge-modelserver-ecs 11 | awslogs-region: us-east-2 12 | awslogs-stream-prefix: web 13 | volumes: 14 | - /app/temp 15 | -------------------------------------------------------------------------------- /image_classifier/ecs-params.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | task_definition: 3 | task_execution_role: ecsTaskExecutionRoleBento 4 | ecs_network_mode: awsvpc 5 | task_size: 6 | mem_limit: 8GB 7 | cpu_limit: 4096 8 | efs_volumes: 9 | - name: docuedgedev-efs 10 | filesystem_id: fs-4717c93f 11 | root_directory: /smartbox-config 12 | run_params: 13 | network_configuration: 14 | awsvpc_configuration: 15 | subnets: 16 | - subnet-00e7bff093931a167 17 | - subnet-0345b051535c9625d 18 | security_groups: 19 | - sg-09b7e06cb8b13167d 20 | - sg-0601a52d4ea28af05 21 | assign_public_ip: ENABLED 22 | -------------------------------------------------------------------------------- /image_classifier/infer.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import load_model 2 | from tensorflow.keras.preprocessing.image import load_img 3 | 4 | import numpy as np 5 | import os 6 | import random 7 | 8 | 9 | def preprocess(image_path: str, image_shape: tuple = None): 10 | image_dimensions = (100, 100, 3) 11 | if image_shape: 12 | pass 13 | else: 14 | image_shape = image_dimensions 15 | test_inp = image_path 16 | test_img = np.asarray(load_img(test_inp, target_size=image_shape)) 17 | test_img = np.divide(test_img, 255.0) 18 | test_img = np.asarray([test_img]).astype("float32") 19 | return test_img 20 | 21 | 22 | def predict(folder_path: str, model_path: str): 23 | image_dimensions = (100, 100, 3) 24 | 
full_image_path = os.path.join(folder_path, random.choice(os.listdir(folder_path))) 25 | model = load_model(model_path) 26 | image = preprocess(image_path=full_image_path, image_shape=image_dimensions) 27 | results = model.predict(image) 28 | print(full_image_path) 29 | print(np.argmax(results[0])) 30 | 31 | 32 | model_path = "/Users/vsatpathy/Desktop/docs/training_data/intel/image_classifier.h5" 33 | folder_path = ( 34 | "/Users/vsatpathy/Desktop/off_POCs/intel-image-classification/seg_train/buildings" 35 | ) 36 | predict(folder_path=folder_path, model_path=model_path) 37 | -------------------------------------------------------------------------------- /image_classifier/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import mlflow 5 | from mlflow import pyfunc 6 | import argparse 7 | import json 8 | 9 | from tensorflow.keras.preprocessing.image import load_img 10 | from tensorflow.keras.models import Model, load_model 11 | from tensorflow.keras.layers import ( 12 | Input, 13 | Conv2D, 14 | Dense, 15 | Flatten, 16 | ) 17 | 18 | tracking_uri = ( 19 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 20 | ) 21 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 22 | 23 | 24 | def model_arc(y_labels: dict, image_inp_shape: tuple): 25 | inp_layer_images = Input(shape=image_inp_shape) 26 | 27 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 28 | inp_layer_images 29 | ) 30 | flatten_layer = Flatten()(conv_layer) 31 | 32 | out_layer = Dense(len(y_labels), activation="softmax")(flatten_layer) 33 | 34 | model = Model(inp_layer_images, out_layer) 35 | model.compile( 36 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 37 | ) 38 | return model 39 | 40 | 41 | def data_loader(gt_data_path: list, gt_labels: dict, bs: int, image_shape: tuple): 42 | while True: 43 | images = [] 44 | labels = [] 45 | while len(images) < bs: 46 | indice = random.randint(0, len(gt_data_path) - 1) 47 | image_path = gt_data_path[indice] 48 | 49 | label = gt_labels.get(image_path.split("/")[-2]) 50 | labels.append(label) 51 | 52 | test_img = np.asarray(load_img(image_path, target_size=image_shape)) 53 | img = np.divide(test_img, 255.0) 54 | images.append(img) 55 | yield np.asarray(images), np.asarray(labels) 56 | 57 | 58 | def read_data(data_path: str): 59 | folders = os.listdir(data_path) 60 | 61 | all_images_paths = [] 62 | all_labels = {} 63 | for label in folders: 64 | if label != ".DS_Store": 65 | images = os.path.join(data_path, label) 66 | for image in os.listdir(images): 67 | full_image_path = os.path.join(images, image) 68 | all_images_paths.append(full_image_path) 69 | if label not in all_labels: 70 | all_labels[label] = len(all_labels) 71 | rev_labels = {} 72 | for key, val in all_labels.items(): 73 | rev_labels[val] = key 74 | return all_images_paths, all_labels, rev_labels 75 | 76 | 77 | def train( 78 | image_shape: int, 79 | epochs: int, 80 | batch_size: int, 81 | data_path: str, 82 | save_dir_path: str, 83 | art_name: str, 84 | exp_name: str, 85 | trained_model_path: str, 86 | ): 87 | mlflow.set_tracking_uri(tracking_uri) 88 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 89 | try: 90 | expr_name = exp_name # create a new experiment (do not replace) 91 | mlflow.create_experiment(expr_name, s3_bucket) 92 | mlflow.set_experiment(expr_name) 93 | experiment = mlflow.get_experiment_by_name(exp_name) 94 
| except Exception: 95 | experiment = mlflow.get_experiment_by_name(exp_name) 96 | 97 | mlflow.tensorflow.autolog(every_n_iter=1) 98 | with mlflow.start_run(experiment_id=experiment.experiment_id): 99 | image_dimensions = (image_shape, image_shape, 3) 100 | no_of_epochs = epochs 101 | batch_size = batch_size 102 | dataset_path = data_path 103 | gt_image_paths, gt_labels, gt_rev_labels = read_data(data_path=dataset_path) 104 | os.makedirs(os.path.join(save_dir_path, art_name), exist_ok=True) 105 | 106 | mlflow.log_metrics( 107 | { 108 | "batch_size": batch_size, 109 | "epochs": epochs, 110 | "image_shape": image_shape, 111 | } 112 | ) 113 | 114 | with open( 115 | os.path.join(save_dir_path, art_name, f"rev_labels_{art_name}.json"), 116 | "w+", 117 | ) as tar: 118 | json.dump(gt_rev_labels, tar) 119 | 120 | print("target_encodings: ", gt_labels) 121 | print("Number of training images: ", len(gt_image_paths)) 122 | 123 | train_gen = data_loader( 124 | gt_data_path=gt_image_paths, 125 | gt_labels=gt_labels, 126 | bs=batch_size, 127 | image_shape=image_dimensions, 128 | ) 129 | 130 | if os.path.isfile(trained_model_path): 131 | model = load_model(trained_model_path) 132 | else: 133 | model = model_arc(y_labels=gt_labels, image_inp_shape=image_dimensions) 134 | model.fit( 135 | x=train_gen, 136 | steps_per_epoch=len(gt_image_paths) // batch_size, 137 | epochs=no_of_epochs, 138 | ) 139 | model.save( 140 | filepath=os.path.join(save_dir_path, art_name, f"image_classifier.h5") 141 | ) 142 | 143 | meta_data_path = os.path.join(save_dir_path, art_name) 144 | for artifact in sorted(os.listdir(meta_data_path)): 145 | if artifact != ".DS_Store": 146 | artifact_path = os.path.join(meta_data_path, artifact) 147 | if ( 148 | os.path.isfile(artifact_path) 149 | and artifact_path.split(".")[-1] != "h5" 150 | ): 151 | print(f"artifact to be uploaded is: {artifact}") 152 | mlflow.log_artifact(local_path=artifact_path) 153 | 154 | artifact_uri = mlflow.get_artifact_uri() 155 | print(artifact_uri) 156 | mlflow.end_run() 157 | 158 | 159 | parser = argparse.ArgumentParser() 160 | parser.add_argument("-dp", "--data_path", help="File path of the dataset") 161 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts") 162 | parser.add_argument("-a", "--art_name", help="Artifacts name") 163 | parser.add_argument("-b", "--batch_size", default=8, type=int, help="Batch size for training") 164 | parser.add_argument("-e", "--epochs", default=3, type=int, help="Number of epochs") 165 | parser.add_argument("-is", "--img_shape", default=100, type=int, help="One dimension of image") 166 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model") 167 | parser.add_argument( 168 | "-exp", 169 | "--experiment_name", 170 | default="intel_classification", 171 | help="Name of the experiment for tracking", 172 | ) 173 | args = parser.parse_args() 174 | train( 175 | image_shape=args.img_shape, 176 | epochs=args.epochs, 177 | batch_size=args.batch_size, 178 | data_path=args.data_path, 179 | save_dir_path=args.file_path, 180 | art_name=args.art_name, 181 | exp_name=args.experiment_name, 182 | trained_model_path=args.model_path, 183 | ) 184 | -------------------------------------------------------------------------------- /multiple_models/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import ModelZoo 2 | import tensorflow as tf 3 | from tensorflow.keras.models import load_model 4 | from tensorflow.keras.preprocessing.text import Tokenizer 5 |
import spacy 6 | import json 7 | 8 | 9 | def mcr_models( 10 | model_zoo, tf_model_path_mcr: str, text_file_path_mcr: str, labels_path_mcr: str 11 | ): 12 | model_cnn = load_model(tf_model_path_mcr) 13 | tf.saved_model.save(model_cnn, "artifacts/") 14 | model_cnn = tf.saved_model.load("artifacts/") 15 | model_zoo.pack("mcr_model", model_cnn) 16 | 17 | text_model = spacy.load("en_core_web_sm") 18 | model_zoo.pack("mcr_spacy_model", text_model) 19 | 20 | tokenizer = Tokenizer() 21 | with open(text_file_path_mcr, "r") as f: 22 | bow = f.read() 23 | tokenizer.fit_on_texts(bow.split("####")) 24 | model_zoo.pack("mcr_tokenizer", tokenizer) 25 | 26 | with open(labels_path_mcr, "r") as f: 27 | labels_mcr = json.load(f) 28 | model_zoo.pack("mcr_labels", labels_mcr) 29 | 30 | 31 | def som_models( 32 | model_zoo, 33 | tf_model_path_som: str, 34 | text_file_path_som: str, 35 | master_labels_path: str, 36 | sub_labels_path: str, 37 | ): 38 | model_cnn = load_model(tf_model_path_som) 39 | tf.saved_model.save(model_cnn, "artifacts/") 40 | model_cnn = tf.saved_model.load("artifacts/") 41 | model_zoo.pack("som_model", model_cnn) 42 | 43 | text_model = spacy.load("en_core_web_sm") 44 | model_zoo.pack("som_spacy_model", text_model) 45 | 46 | tokenizer = Tokenizer() 47 | with open(text_file_path_som, "r") as f: 48 | bow = f.read() 49 | tokenizer.fit_on_texts(bow.split("####")) 50 | model_zoo.pack("som_tokenizer", tokenizer) 51 | 52 | with open(master_labels_path, "r") as f: 53 | labels_som = json.load(f) 54 | model_zoo.pack("som_master_labels", labels_som) 55 | 56 | with open(sub_labels_path, "r") as g: 57 | sub_labels_som = json.load(g) 58 | model_zoo.pack("som_sub_labels", sub_labels_som) 59 | 60 | 61 | def main(): 62 | model_zoo = ModelZoo() 63 | mcr_models( 64 | model_zoo=model_zoo, 65 | tf_model_path_mcr=tf_model_path_mcr, 66 | text_file_path_mcr=text_file_path_mcr, 67 | labels_path_mcr=labels_path_mcr, 68 | ) 69 | som_models( 70 | model_zoo=model_zoo, 71 | tf_model_path_som=tf_model_path_som, 72 | text_file_path_som=text_file_path_som, 73 | master_labels_path=master_labels_path, 74 | sub_labels_path=sub_labels_path, 75 | ) 76 | saved_path = model_zoo.save() 77 | 78 | 79 | tf_model_path_mcr = ( 80 | "/Users/vsatpathy/Desktop/docs/training_data/mcr/document_classifier.h5" 81 | ) 82 | text_file_path_mcr = ( 83 | "/Users/vsatpathy/Desktop/docs/training_data/mcr/file_and_text_mcr.txt" 84 | ) 85 | labels_path_mcr = "/Users/vsatpathy/Desktop/docs/training_data/mcr/rev_labels_mcr.json" 86 | 87 | tf_model_path_som = ( 88 | "/Users/vsatpathy/Desktop/docs/training_data/som/document_classifier.h5" 89 | ) 90 | text_file_path_som = ( 91 | "/Users/vsatpathy/Desktop/docs/training_data/som/file_and_text_som.txt" 92 | ) 93 | master_labels_path = ( 94 | "/Users/vsatpathy/Desktop/docs/training_data/som/rev_labels_master_som.json" 95 | ) 96 | sub_labels_path = "/Users/vsatpathy/Desktop/docs/training_data/som/rev_labels_som.json" 97 | main() 98 | -------------------------------------------------------------------------------- /multiple_models/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.types import FileLike 3 | from bentoml.adapters import JsonInput, FileInput, MultiFileInput 4 | from bentoml.frameworks.spacy import SpacyModelArtifact 5 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 6 | from bentoml.service.artifacts.common import ( 7 | JSONArtifact, 8 | PickleArtifact, 9 | ) 10 | 11 | import tensorflow as tf 
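# Note: this combined service packs both artifact sets produced by bento_package.py above,
# the single-level "mcr" document classifier and the two-level "som" classifier, and exposes
# one FileInput endpoint per model (predict_document_labels_mcr and predict_document_labels_som).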
12 | from tensorflow.keras.preprocessing.image import load_img 13 | import numpy as np 14 | from pytesseract import image_to_string 15 | import re 16 | from PIL import Image 17 | from typing import List 18 | 19 | 20 | @bentoml.env(infer_pip_packages=True) 21 | @bentoml.artifacts( 22 | [ 23 | TensorflowSavedModelArtifact("mcr_model"), 24 | SpacyModelArtifact("mcr_spacy_model"), 25 | PickleArtifact("mcr_tokenizer"), 26 | JSONArtifact("mcr_labels"), 27 | TensorflowSavedModelArtifact("som_model"), 28 | SpacyModelArtifact("som_spacy_model"), 29 | PickleArtifact("som_tokenizer"), 30 | JSONArtifact("som_master_labels"), 31 | JSONArtifact("som_sub_labels"), 32 | ] 33 | ) 34 | class ModelZoo(bentoml.BentoService): 35 | def helper(self, text): 36 | dummy = [] 37 | for word in text: 38 | dummy.append(str(word)) 39 | final = " ".join(dummy) 40 | return final 41 | 42 | def preprocess_spacy(self, spacy_model, text, num_of_words: int): 43 | text = str(text) 44 | text = text.split(" ") 45 | text = self.helper(text) 46 | text = str(text.lower()) 47 | # Remove all the special characters 48 | text = re.sub(r"\W", " ", text) 49 | text = re.sub(r"[^a-zA-Z ]+", "", text) 50 | # remove all single characters 51 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 52 | # Remove single characters from the start 53 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 54 | # Substituting multiple spaces with single space 55 | text = re.sub(r"\s+", " ", text, flags=re.I) 56 | # text = self.artifacts.mcr_spacy_model(text) 57 | text = spacy_model(text) 58 | filtered = [token.lemma_ for token in text if token.is_stop == False] 59 | text = " ".join(filtered[: num_of_words * 2]) 60 | text = text.strip().split(" ") 61 | text = " ".join(text[:num_of_words]) 62 | return text 63 | 64 | def tokenize_sentence(self, sentence, tokenizer, maximum_word_length): 65 | updated_sentence = sentence.split(" ") 66 | tok_sent = [] 67 | for word in updated_sentence: 68 | if word in tokenizer.word_index: 69 | tok_sent.append(tokenizer.word_index[word]) 70 | else: 71 | tok_sent.append(0) 72 | if len(tok_sent) != maximum_word_length: 73 | delta = maximum_word_length - len(tok_sent) 74 | for i in range(delta): 75 | tok_sent.append(0) 76 | return tok_sent 77 | 78 | def pre_process_image(self, image_file): 79 | ocr_image = np.asarray(Image.open(image_file)) 80 | image = np.asarray( 81 | Image.open(image_file).convert(mode="RGB").resize((100, 100)) 82 | ) 83 | image = np.divide(image, 255.0) 84 | image = np.asarray([image]).astype("float32") 85 | return ocr_image, image 86 | 87 | def pre_process_mcr(self, file): 88 | ocr_image, image = self.pre_process_image(image_file=file) 89 | doc_text = image_to_string(ocr_image) 90 | doc_text_processed = self.preprocess_spacy( 91 | spacy_model=self.artifacts.mcr_spacy_model, text=doc_text, num_of_words=10 92 | ) 93 | fin_text = self.tokenize_sentence( 94 | sentence=doc_text_processed, 95 | tokenizer=self.artifacts.mcr_tokenizer, 96 | maximum_word_length=10, 97 | ) 98 | return image, np.asarray([fin_text]).astype("float32") 99 | 100 | def pre_process_som(self, file): 101 | ocr_image, image = self.pre_process_image(image_file=file) 102 | doc_text = image_to_string(ocr_image) 103 | doc_text_processed = self.preprocess_spacy( 104 | spacy_model=self.artifacts.som_spacy_model, text=doc_text, num_of_words=10 105 | ) 106 | fin_text = self.tokenize_sentence( 107 | sentence=doc_text_processed, 108 | tokenizer=self.artifacts.som_tokenizer, 109 | maximum_word_length=10, 110 | ) 111 | return image, 
np.asarray([fin_text]).astype("float32") 112 | 113 | @bentoml.api(input=FileInput()) 114 | def predict_document_labels_mcr(self, file_stream): 115 | image, text = self.pre_process_mcr(file=file_stream) 116 | model = self.artifacts.mcr_model.signatures["serving_default"] 117 | model._num_positional_args = 2 118 | results = model(tf.constant(text), tf.constant(image)) 119 | conv_results = results.get("dense_1")[0].numpy() 120 | document_label = self.artifacts.mcr_labels[str(np.argmax(conv_results))] 121 | return {"document_type": document_label} 122 | 123 | @bentoml.api(input=FileInput()) 124 | def predict_document_labels_som(self, file_stream): 125 | image, text = self.pre_process_som(file=file_stream) 126 | model = self.artifacts.som_model.signatures["serving_default"] 127 | model._num_positional_args = 2 128 | results = model(tf.constant(text), tf.constant(image)) 129 | mas_results = results.get("dense_1")[0].numpy() 130 | sub_results = results.get("dense_4")[0].numpy() 131 | master_label = self.artifacts.som_master_labels[str(np.argmax(mas_results))] 132 | sub_label = self.artifacts.som_sub_labels[str(np.argmax(sub_results))] 133 | return {"master document type": master_label, "sub document type": sub_label} 134 | -------------------------------------------------------------------------------- /multiple_models/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | web: 4 | image: 142339138776.dkr.ecr.us-east-2.amazonaws.com/docuedge-model-zoo:latest 5 | ports: 6 | - "5000:5000" 7 | logging: 8 | driver: awslogs 9 | options: 10 | awslogs-group: docuedge-modelserver-ecs 11 | awslogs-region: us-east-2 12 | awslogs-stream-prefix: web 13 | volumes: 14 | - /app/temp 15 | -------------------------------------------------------------------------------- /multiple_models/ecs-params.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | task_definition: 3 | task_execution_role: ecsTaskExecutionRoleBento 4 | ecs_network_mode: awsvpc 5 | task_size: 6 | mem_limit: 8GB 7 | cpu_limit: 4096 8 | efs_volumes: 9 | - name: docuedgedev-efs 10 | filesystem_id: fs-4717c93f 11 | root_directory: /smartbox-config 12 | run_params: 13 | network_configuration: 14 | awsvpc_configuration: 15 | subnets: 16 | - subnet-00e7bff093931a167 17 | - subnet-0345b051535c9625d 18 | security_groups: 19 | - sg-09b7e06cb8b13167d 20 | - sg-0601a52d4ea28af05 21 | assign_public_ip: ENABLED 22 | -------------------------------------------------------------------------------- /multiple_models/hybrid_v1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from tensorflow.keras.preprocessing.image import load_img 4 | from tensorflow.keras.preprocessing.text import Tokenizer 5 | from tensorflow.keras.layers import ( 6 | Input, 7 | Conv2D, 8 | Dense, 9 | Flatten, 10 | Embedding, 11 | Concatenate, 12 | GlobalMaxPool1D, 13 | ) 14 | from tensorflow.keras.models import Model, load_model 15 | import os 16 | import json 17 | import mlflow 18 | import mlflow.tensorflow 19 | 20 | tracking_uri = "http://testuser:test@ec2-18-220-228-243.us-east-2.compute.amazonaws.com" 21 | mlflow.set_tracking_uri(tracking_uri) 22 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 23 | try: 24 | expr_name = "hybrid_v1" # create a new experiment (do not replace) 25 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 26 | 
mlflow.create_experiment(expr_name, s3_bucket) 27 | mlflow.set_experiment(expr_name) 28 | except Exception: 29 | pass 30 | experiment = mlflow.get_experiment_by_name("hybrid_v1")  # bound on both paths, so start_run below cannot hit a NameError 31 | 32 | def read_data(path): 33 | bow = open(path, "r") 34 | data = bow.readlines() 35 | all_data_paths = [] 36 | all_texts = [] 37 | y_labels = {} 38 | for line in data: 39 | line_data = line.split("####") 40 | all_data_paths.append(line_data[0]) 41 | all_texts.append(line_data[-1][:-1]) 42 | label = line_data[0].split("/")[-2] 43 | if label not in y_labels: 44 | y_labels[label] = len(y_labels) 45 | 46 | rev_labels = {} 47 | for key, val in y_labels.items(): 48 | rev_labels[val] = key 49 | 50 | return all_data_paths, y_labels, rev_labels, all_texts 51 | 52 | 53 | def tokenize_sentence(sentence, tokenizer, maximum_word_length): 54 | updated_sentence = sentence.split(" ") 55 | tok_sent = [] 56 | for word in updated_sentence: 57 | if word in tokenizer.word_index: 58 | tok_sent.append(tokenizer.word_index[word]) 59 | else: 60 | tok_sent.append(0) 61 | if len(tok_sent) != maximum_word_length: 62 | delta = maximum_word_length - len(tok_sent) 63 | for i in range(delta): 64 | tok_sent.append(0) 65 | return tok_sent 66 | 67 | 68 | def data_loader_text( 69 | bs, data, y_lab, tokenizer, text_data, image_input_shape, max_word_length 70 | ): 71 | while True: 72 | images = [] 73 | labels = [] 74 | texts = [] 75 | while len(images) < bs: 76 | indice = random.randint(0, len(data) - 1) 77 | target = data[indice].split("/")[-2] 78 | labels.append(y_lab[target]) 79 | 80 | test_img = np.asarray(load_img(data[indice], target_size=image_input_shape)) 81 | img = np.divide(test_img, 255.0) 82 | images.append(img) 83 | 84 | tok_sen = tokenize_sentence( 85 | text_data[indice], tokenizer, maximum_word_length=max_word_length 86 | ) 87 | texts.append(tok_sen) 88 | yield [np.asarray(images), np.asarray(texts)], np.asarray(labels) 89 | 90 | 91 | def model_arc(y_labels, tokenizer, text_model_inp_shape, image_inp_shape): 92 | inp_layer_texts = Input(shape=text_model_inp_shape) 93 | inp_layer_images = Input(shape=image_inp_shape) 94 | 95 | embedding_layer = Embedding( 96 | input_dim=len(tokenizer.word_index) + 1, 97 | output_dim=64, 98 | input_length=text_model_inp_shape, 99 | trainable=True, 100 | )(inp_layer_texts) 101 | pooling_layer = GlobalMaxPool1D()(embedding_layer) 102 | dense_layer = Dense(units=64, activation="relu")(pooling_layer) 103 | # lstm_layer = Bidirectional(LSTM(units=32))(embedding_layer) 104 | 105 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 106 | inp_layer_images 107 | ) 108 | flatten_layer = Flatten()(conv_layer) 109 | 110 | concat_layer = Concatenate()([flatten_layer, dense_layer]) 111 | out_layer = Dense(len(y_labels), activation="softmax")(concat_layer) 112 | 113 | model = Model([inp_layer_images, inp_layer_texts], out_layer) 114 | model.compile( 115 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 116 | ) 117 | return model 118 | 119 | 120 | def train_hybrid_v1( 121 | text_plus_file_path: str, 122 | batch_size: int, 123 | epochs: int, 124 | image_shape: int, 125 | max_words: int, 126 | artifact_name: str, 127 | save_dir_path: str, 128 | trained_model_path: str, 129 | ): 130 | all_imgs_path, y_labels, rev_labels, all_text = read_data(path=text_plus_file_path) 131 | num_train_img = len(all_imgs_path) 132 | 133 | with open( 134 | os.path.join(save_dir_path, artifact_name, f"rev_labels_{artifact_name}.json"), 135 | "w+", 136 | ) as tar: 137 | json.dump(rev_labels, tar) 138 | 139 | print("target_encodings: ", y_labels) 140 | print("Number of training images: ", num_train_img) 141 | 142 | bow = open(text_plus_file_path, "r") 143 | tokenizer = Tokenizer() 144 | tokenizer.fit_on_texts(bow.read().split("####")) 145 | 146 | train_gen = data_loader_text( 147 | tokenizer=tokenizer, 148 | y_lab=y_labels, 149 | data=all_imgs_path, 150 | text_data=all_text, 151 | bs=batch_size, 152 | image_input_shape=(image_shape, image_shape, 3), 153 | max_word_length=max_words, 154 | ) 155 | if os.path.isfile(trained_model_path): 156 | model = load_model(trained_model_path) 157 | else: 158 | model = model_arc( 159 | y_labels=y_labels, 160 | tokenizer=tokenizer, 161 | text_model_inp_shape=(max_words,), 162 | image_inp_shape=(image_shape, image_shape, 3), 163 | ) 164 | mlflow.tensorflow.autolog(every_n_iter=1) 165 | with mlflow.start_run(experiment_id=experiment.experiment_id): 166 | mlflow.log_metrics( 167 | { 168 | "batch_size": batch_size, 169 | "epochs": epochs, 170 | "image_shape": image_shape, 171 | "max_words": max_words, 172 | } 173 | ) 174 | history = model.fit( 175 | x=train_gen, 176 | steps_per_epoch=num_train_img // batch_size, 177 | epochs=epochs, 178 | ) 179 | model.save( 180 | filepath=os.path.join( 181 | save_dir_path, artifact_name, "document_classifier.h5" 182 | ) 183 | ) 184 | meta_data_path = os.path.join(save_dir_path, artifact_name) 185 | for artifact in sorted(os.listdir(meta_data_path)): 186 | if artifact != ".DS_Store": 187 | artifact_path = os.path.join(meta_data_path, artifact) 188 | if ( 189 | os.path.isfile(artifact_path) 190 | and artifact_path.split(".")[-1] != "h5" 191 | ): 192 | print(f"artifact to be uploaded is: {artifact}") 193 | mlflow.log_artifact(local_path=artifact_path) 194 | 195 | artifact_uri = mlflow.get_artifact_uri() 196 | print(artifact_uri) 197 | mlflow.end_run() 198 |
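hybrid_v2.py below extends hybrid_v1 with hierarchical labels: the parent folder supplies a master document type and the leaf folder a sub type, and model_arc returns a model with two softmax heads trained jointly. A minimal sketch of decoding both heads at inference time (a hedged illustration; the rev_labels names come from read_data below, the rest is assumed):

master_probs, sub_probs = model.predict([images, texts])  # one array per output head
master_label = rev_labels_master_doc_type[int(np.argmax(master_probs[0]))]
sub_label = rev_labels_doc_type[int(np.argmax(sub_probs[0]))]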
-------------------------------------------------------------------------------- /multiple_models/hybrid_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | from tensorflow.keras.preprocessing.image import load_img 5 | from tensorflow.keras.preprocessing.text import Tokenizer 6 | from tensorflow.keras.layers import ( 7 | Input, 8 | Conv2D, 9 | Dense, 10 | Flatten, 11 | Embedding, 12 | Concatenate, 13 | GlobalMaxPool1D, 14 | Conv1D, 15 | MaxPooling1D, 16 | ) 17 | from tensorflow.keras.models import Model, load_model 18 | import os 19 | import json 20 | import mlflow 21 | import mlflow.tensorflow 22 | 23 | tracking_uri = "http://testuser:test@ec2-18-220-228-243.us-east-2.compute.amazonaws.com" 24 | mlflow.set_tracking_uri(tracking_uri) 25 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 26 | try: 27 | expr_name = "hybrid_v2" # create a new experiment (do not replace) 28 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 29 | mlflow.create_experiment(expr_name, s3_bucket) 30 | mlflow.set_experiment(expr_name) 31 | except Exception: 32 | pass 33 | experiment = mlflow.get_experiment_by_name("hybrid_v2")  # bound on both paths, so start_run below cannot hit a NameError 34 | 35 | def read_data(path): 36 | bow = open(path, "r") 37 | data = bow.readlines() 38 | all_data_paths = [] 39 | all_texts = [] 40 | doc_type_y_labels = {} 41 | master_doc_type_y_labels = {} 42 | for line in data: 43 | line_data = line.split("####") 44 | all_data_paths.append(line_data[0]) 45 | all_texts.append(line_data[-1][:-1]) 46 | doc_type_label = line_data[0].split("/")[-2] 47 | master_doc_type_label = line_data[0].split("/")[-3] 48 | if
doc_type_label not in doc_type_y_labels: 49 | doc_type_y_labels[doc_type_label] = len(doc_type_y_labels) 50 | if master_doc_type_label not in master_doc_type_y_labels: 51 | master_doc_type_y_labels[master_doc_type_label] = len( 52 | master_doc_type_y_labels 53 | ) 54 | 55 | rev_labels_doc_type = {} 56 | for key, val in doc_type_y_labels.items(): 57 | rev_labels_doc_type[val] = key 58 | rev_labels_master_doc_type = {} 59 | for key, val in master_doc_type_y_labels.items(): 60 | rev_labels_master_doc_type[val] = key 61 | 62 | return ( 63 | all_data_paths, 64 | doc_type_y_labels, 65 | rev_labels_doc_type, 66 | all_texts, 67 | master_doc_type_y_labels, 68 | rev_labels_master_doc_type, 69 | ) 70 | 71 | 72 | def tokenize_sentence(sentence, tokenizer, maximum_word_length): 73 | updated_sentence = sentence.split(" ") 74 | tok_sent = [] 75 | for word in updated_sentence: 76 | if word in tokenizer.word_index: 77 | tok_sent.append(tokenizer.word_index[word]) 78 | else: 79 | tok_sent.append(0) 80 | if len(tok_sent) != maximum_word_length: 81 | delta = maximum_word_length - len(tok_sent) 82 | for i in range(delta): 83 | tok_sent.append(0) 84 | return tok_sent 85 | 86 | 87 | def data_loader_text( 88 | bs, 89 | data, 90 | y_lab, 91 | tokenizer, 92 | text_data, 93 | image_input_shape, 94 | max_word_length, 95 | y_sub_labels, 96 | ): 97 | while True: 98 | images = [] 99 | master_labels = [] 100 | sub_labels = [] 101 | texts = [] 102 | while len(images) < bs: 103 | indice = random.randint(0, len(data) - 1) 104 | target = data[indice].split("/")[-3] 105 | sub_target = data[indice].split("/")[-2] 106 | master_labels.append(y_lab[target]) 107 | sub_labels.append(y_sub_labels[sub_target]) 108 | 109 | test_img = np.asarray(load_img(data[indice], target_size=image_input_shape)) 110 | img = np.divide(test_img, 255.0) 111 | images.append(img) 112 | 113 | tok_sen = tokenize_sentence( 114 | text_data[indice], tokenizer, maximum_word_length=max_word_length 115 | ) 116 | texts.append(tok_sen) 117 | yield [np.asarray(images), np.asarray(texts)], [ 118 | np.asarray(master_labels), 119 | np.asarray(sub_labels), 120 | ] 121 | 122 | 123 | def model_arc(y_labels, tokenizer, text_model_inp_shape, image_inp_shape, y_sub_labels): 124 | inp_layer_texts = Input(shape=text_model_inp_shape) 125 | inp_layer_images = Input(shape=image_inp_shape) 126 | 127 | embedding_layer = Embedding( 128 | input_dim=len(tokenizer.word_index) + 1, 129 | output_dim=64, 130 | input_length=text_model_inp_shape, 131 | trainable=True, 132 | )(inp_layer_texts) 133 | pooling_layer = GlobalMaxPool1D()(embedding_layer) 134 | dense_layer = Dense(units=64, activation="relu")(pooling_layer) 135 | 136 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 137 | inp_layer_images 138 | ) 139 | flatten_layer = Flatten()(conv_layer) 140 | 141 | concat_layer = Concatenate()([flatten_layer, dense_layer]) 142 | out_layer = Dense(len(y_labels), activation="softmax")(concat_layer) 143 | 144 | sub_model_inp = Dense(units=64, activation="relu")(out_layer) 145 | sub_dense_layer = Dense(units=256, activation="relu")(sub_model_inp) 146 | # sub_expansion_layer = tf.expand_dims(sub_model_inp, axis=-1) 147 | # sub_conv_layer = Conv1D(filters=64, kernel_size=(2,), activation="relu")( 148 | # sub_expansion_layer 149 | # ) 150 | # sub_pool_layer = MaxPooling1D(pool_size=2)(sub_conv_layer) 151 | # sub_flatten_layer = Flatten()(sub_pool_layer) 152 | 153 | sub_concat_layer = Concatenate()([sub_dense_layer, concat_layer]) 154 | sub_out_layer = 
Dense(units=len(y_sub_labels), activation="softmax")( 155 | sub_concat_layer 156 | ) 157 | 158 | model = Model([inp_layer_images, inp_layer_texts], [out_layer, sub_out_layer]) 159 | model.compile( 160 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 161 | ) 162 | return model 163 | 164 | 165 | def train_hybrid_v2( 166 | text_plus_file_path: str, 167 | batch_size: int, 168 | epochs: int, 169 | image_shape: int, 170 | max_words: int, 171 | artifact_name: str, 172 | save_dir_path: str, 173 | trained_model_path: str, 174 | ): 175 | ( 176 | all_imgs_path, 177 | doc_type_y_labels, 178 | rev_labels_doc_type, 179 | all_text, 180 | master_doc_type_label, 181 | rev_labels_master_doc_type, 182 | ) = read_data(path=text_plus_file_path) 183 | num_train_img = len(all_imgs_path) 184 | 185 | with open( 186 | os.path.join(save_dir_path, artifact_name, f"rev_labels_{artifact_name}.json"), 187 | "w+", 188 | ) as tar: 189 | json.dump(rev_labels_doc_type, tar) 190 | with open( 191 | os.path.join( 192 | save_dir_path, artifact_name, f"rev_labels_master_{artifact_name}.json" 193 | ), 194 | "w+", 195 | ) as tar: 196 | json.dump(rev_labels_master_doc_type, tar) 197 | 198 | print("target_encodings: ", master_doc_type_label) 199 | print("target_encodings: ", doc_type_y_labels) 200 | print("Number of training images: ", num_train_img) 201 | 202 | bow = open(text_plus_file_path, "r") 203 | tokenizer = Tokenizer() 204 | tokenizer.fit_on_texts(bow.read().split("####")) 205 | 206 | train_gen = data_loader_text( 207 | tokenizer=tokenizer, 208 | y_lab=master_doc_type_label, 209 | data=all_imgs_path, 210 | text_data=all_text, 211 | bs=batch_size, 212 | image_input_shape=(image_shape, image_shape, 3), 213 | max_word_length=max_words, 214 | y_sub_labels=doc_type_y_labels, 215 | ) 216 | if os.path.isfile(trained_model_path): 217 | model = load_model(trained_model_path) 218 | else: 219 | model = model_arc( 220 | y_labels=master_doc_type_label, 221 | tokenizer=tokenizer, 222 | text_model_inp_shape=(max_words,), 223 | image_inp_shape=(image_shape, image_shape, 3), 224 | y_sub_labels=doc_type_y_labels, 225 | ) 226 | mlflow.tensorflow.autolog(every_n_iter=1) 227 | with mlflow.start_run(experiment_id=experiment.experiment_id): 228 | mlflow.log_metrics( 229 | { 230 | "batch_size": batch_size, 231 | "epochs": epochs, 232 | "image_shape": image_shape, 233 | "max_words": max_words, 234 | } 235 | ) 236 | model.fit( 237 | x=train_gen, steps_per_epoch=num_train_img // batch_size, epochs=epochs 238 | ) 239 | model.save( 240 | filepath=os.path.join( 241 | save_dir_path, artifact_name, "document_classifier.h5" 242 | ) 243 | ) 244 | meta_data_path = os.path.join(save_dir_path, artifact_name) 245 | for artifact in sorted(os.listdir(meta_data_path)): 246 | if artifact != ".DS_Store": 247 | artifact_path = os.path.join(meta_data_path, artifact) 248 | if ( 249 | os.path.isfile(artifact_path) 250 | and artifact_path.split(".")[-1] != "h5" 251 | ): 252 | print(f"artifact to be uploaded is: {artifact}") 253 | mlflow.log_artifact(local_path=artifact_path) 254 | 255 | artifact_uri = mlflow.get_artifact_uri() 256 | print(artifact_uri) 257 | mlflow.end_run() 258 | -------------------------------------------------------------------------------- /multiple_models/pre_process_text.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import re 3 | from pdf2image import convert_from_path 4 | import os 5 | from tqdm import tqdm 6 | import pre_processing 7 | from PIL import 
Image 8 | import pytesseract 9 | 10 | nlp = spacy.load("en_core_web_sm") 11 | 12 | 13 | def helper(text): 14 | dummy = [] 15 | for word in text: 16 | dummy.append(str(word)) 17 | final = " ".join(dummy) 18 | return final 19 | 20 | 21 | def preprocess_spacy(text, num_of_words: int): 22 | text = str(text) 23 | text = text.split(" ") 24 | text = helper(text) 25 | text = str(text.lower()) 26 | # Remove all the special characters 27 | text = re.sub(r"\W", " ", text) 28 | text = re.sub(r"[^a-zA-Z ]+", "", text) 29 | # remove all single characters 30 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 31 | # Remove single characters from the start 32 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 33 | # Substituting multiple spaces with single space 34 | text = re.sub(r"\s+", " ", text, flags=re.I) 35 | text = nlp(text) 36 | filtered = [token.lemma_ for token in text if token.is_stop == False] 37 | text = " ".join(filtered[: num_of_words * 2]) 38 | text = text.strip().split(" ") 39 | text = " ".join(text[:num_of_words]) 40 | return text 41 | 42 | 43 | def read_text_from_pages( 44 | complete_folder_path: str, 45 | path_to_save_essential_data: str, 46 | meta_name: str, 47 | num_of_words: int, 48 | ): 49 | final_path_for_data = os.path.join( 50 | path_to_save_essential_data, meta_name, f"file_and_text_{meta_name}.txt" 51 | ) 52 | if os.path.isfile(final_path_for_data): 53 | data = open(final_path_for_data, "r").read() 54 | file_exists = True 55 | else: 56 | data = "null" 57 | file_exists = False 58 | print("#### Reading pages ####") 59 | doc_image_types = sorted(os.listdir(complete_folder_path)) 60 | text_of_all_pages = [] 61 | for doc_image_type in doc_image_types: 62 | if doc_image_type != ".DS_Store": 63 | print("DOCUMENT TYPE: ", doc_image_type) 64 | complete_doc_image_path = os.path.join(complete_folder_path, doc_image_type) 65 | pages = sorted(os.listdir(complete_doc_image_path)) 66 | for page in tqdm(pages): 67 | if page != ".DS_Store": 68 | page_path = os.path.join(complete_doc_image_path, page) 69 | if page_path not in data: 70 | document_page = Image.open(page_path) 71 | document_text = pytesseract.image_to_string(document_page) 72 | document_page.close() 73 | essential_file_path_and_text = ( 74 | page_path 75 | + "####" 76 | + preprocess_spacy(document_text, num_of_words=num_of_words) 77 | + "\n" 78 | ) 79 | text_of_all_pages.append(essential_file_path_and_text) 80 | 81 | if os.path.isfile(final_path_for_data): 82 | all_essential_data = open(final_path_for_data, "a+") 83 | all_essential_data.writelines(text_of_all_pages) 84 | else: 85 | all_essential_data = open(final_path_for_data, "w") 86 | all_essential_data.writelines(text_of_all_pages) 87 | return final_path_for_data 88 | 89 | 90 | def read_text_from_pages_v2( 91 | complete_folder_path: str, 92 | path_to_save_essential_data: str, 93 | meta_name: str, 94 | num_of_words: int, 95 | ): 96 | final_path_for_data = os.path.join( 97 | path_to_save_essential_data, meta_name, f"file_and_text_{meta_name}.txt" 98 | ) 99 | if os.path.isfile(final_path_for_data): 100 | data = open(final_path_for_data, "r").read() 101 | file_exists = True 102 | else: 103 | data = "null" 104 | file_exists = False 105 | print("#### Reading pages ####") 106 | document_folders_path = os.path.join(complete_folder_path) 107 | master_doc_types = sorted(os.listdir(document_folders_path)) 108 | text_of_all_pages = [] 109 | for master_doc_type in master_doc_types: 110 | if master_doc_type != ".DS_Store": 111 | print("MASTER DOCUMENT TYPE: ", master_doc_type) 112 | sub_doc_type_path 
= os.path.join(document_folders_path, master_doc_type) 113 | for doc_image_type in sorted(os.listdir(sub_doc_type_path)): 114 | if doc_image_type != ".DS_Store": 115 | print("DOCUMENT TYPE: ", doc_image_type) 116 | complete_doc_image_path = os.path.join( 117 | sub_doc_type_path, doc_image_type 118 | ) 119 | pages = sorted(os.listdir(complete_doc_image_path)) 120 | for page in tqdm(pages): 121 | if page != ".DS_Store": 122 | page_path = os.path.join(complete_doc_image_path, page) 123 | if page_path not in data: 124 | document_page = Image.open(page_path) 125 | document_text = pytesseract.image_to_string( 126 | document_page 127 | ) 128 | document_page.close() 129 | essential_file_path_and_text = ( 130 | page_path 131 | + "####" 132 | + preprocess_spacy( 133 | document_text, num_of_words=num_of_words 134 | ) 135 | + "\n" 136 | ) 137 | text_of_all_pages.append(essential_file_path_and_text) 138 | 139 | if os.path.isfile(final_path_for_data): 140 | all_essential_data = open(final_path_for_data, "a+") 141 | all_essential_data.writelines(text_of_all_pages) 142 | else: 143 | all_essential_data = open(final_path_for_data, "w") 144 | all_essential_data.writelines(text_of_all_pages) 145 | return final_path_for_data 146 | 147 | 148 | def pdf_to_images(full_path_pdf: str, converted_images_path: str, meta_name: str): 149 | doc = full_path_pdf.split("/")[-1] 150 | index = 0 151 | OUTPUT_PATH = converted_images_path 152 | os.makedirs(name=OUTPUT_PATH, exist_ok=True) 153 | 154 | print("Document name: ", doc) 155 | if str(doc.split(".pdf")[-2]) + "_" + str(index) + ".jpg" not in os.listdir( 156 | converted_images_path 157 | ): 158 | pil_images = convert_from_path(full_path_pdf, dpi=300) 159 | 160 | for image in tqdm(pil_images): 161 | processed_image = pre_processing.preprocess_image_file(image) 162 | try: 163 | processed_image = Image.fromarray(processed_image) 164 | processed_image.save( 165 | os.path.join(OUTPUT_PATH, str(doc.split(".pdf")[-2])) 166 | + "_" 167 | + str(index) 168 | + ".jpg", 169 | format="JPEG", 170 | subsampling=0, 171 | quality=100, 172 | ) 173 | index += 1 174 | processed_image.close() 175 | except: 176 | index += 1 177 | else: 178 | pass -------------------------------------------------------------------------------- /multiple_models/pre_processing.py: -------------------------------------------------------------------------------- 1 | """IMAGE PREPROCESSING FUNCTIONS 2 | """ 3 | import cv2 4 | import numpy as np 5 | from scipy.ndimage.filters import rank_filter 6 | 7 | # from sbox.utils.sbox_logger import logger 8 | import pytesseract 9 | import re 10 | import imutils 11 | from PIL import Image 12 | 13 | # print("error") # = logger(__name__) 14 | 15 | 16 | class PagePreprocess(object): 17 | def __init__(self, im): 18 | self.err = False 19 | self.orig_im = im 20 | self.orig_shape = self.orig_im.shape 21 | self.image = im 22 | 23 | def crop(self): 24 | try: 25 | self.image, self.num_tries = process_image(self.orig_im) 26 | self.crop_shape = self.image.shape 27 | return self.image 28 | except Exception as e: 29 | print("crop_obj_Error") # (f"Error: {e}", exc_info=True) 30 | 31 | def deskew(self): 32 | try: 33 | self.image, self.theta_est = process_skewed_crop(self.image) 34 | return self.image 35 | except Exception as e: 36 | print("deskew_obj_Error") # (f"Error: {e}", exc_info=True) 37 | 38 | 39 | def auto_canny(image, sigma=0.33): 40 | try: 41 | v = np.median(image) 42 | lower = int(max(0, (1.0 - sigma) * v)) 43 | upper = int(min(255, (1.0 + sigma) * v)) 44 | edged = cv2.Canny(image, 
lower, upper, True) 45 | return edged 46 | except Exception as e: 47 | print("auto_canny_Error") # (f"Error: {e}", exc_info=True) 48 | 49 | 50 | def dilate(image, kernel, iterations): 51 | dilated_image = cv2.dilate(image, kernel, iterations=iterations) 52 | return dilated_image 53 | 54 | 55 | def downscale_image(im, max_dim=2048): 56 | try: 57 | a, b = im.shape[:2] 58 | if max(a, b) <= max_dim: 59 | return 1.0, im 60 | 61 | scale = 1.0 * max_dim / max(a, b) 62 | new_im = cv2.resize(im, (int(b * scale), int(a * scale)), cv2.INTER_AREA) 63 | return scale, new_im 64 | except Exception as e: 65 | print("error") # (f"Error: {e}", exc_info=True) 66 | 67 | 68 | def find_components(im, max_components=16): 69 | try: 70 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) 71 | dilation = dilate(im, kernel, 6) 72 | 73 | count = 21 74 | n = 0 75 | sigma = 0.000 76 | 77 | while count > max_components: 78 | n += 1 79 | sigma += 0.005 80 | result = cv2.findContours(dilation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 81 | if len(result) == 3: 82 | _, contours, hierarchy = result 83 | elif len(result) == 2: 84 | contours, hierarchy = result 85 | possible = find_likely_rectangles(contours, sigma) 86 | count = len(possible) 87 | 88 | return (dilation, possible, n) 89 | except Exception as e: 90 | print("comp_error") # (f"Error: {e}", exc_info=True) 91 | 92 | 93 | def find_likely_rectangles(contours, sigma): 94 | try: 95 | contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] 96 | possible = [] 97 | for c in contours: 98 | 99 | peri = cv2.arcLength(c, True) 100 | approx = cv2.approxPolyDP(c, sigma * peri, True) 101 | box = make_box(approx) 102 | possible.append(box) 103 | 104 | return possible 105 | except Exception as e: 106 | print("likely_rec_error") # (f"Error: {e}", exc_info=True) 107 | 108 | 109 | def make_box(poly): 110 | try: 111 | x = [] 112 | y = [] 113 | for p in poly: 114 | for point in p: 115 | x.append(point[0]) 116 | y.append(point[1]) 117 | xmax = max(x) 118 | ymax = max(y) 119 | xmin = min(x) 120 | ymin = min(y) 121 | return (xmin, ymin, xmax, ymax) 122 | except Exception as e: 123 | print("bbox_error") # (f"Error: {e}", exc_info=True) 124 | 125 | 126 | def rect_union(crop1, crop2): 127 | x11, y11, x21, y21 = crop1 128 | x12, y12, x22, y22 = crop2 129 | return min(x11, x12), min(y11, y12), max(x21, x22), max(y21, y22) 130 | 131 | 132 | def rect_area(crop): 133 | x1, y1, x2, y2 = crop 134 | return max(0, x2 - x1) * max(0, y2 - y1) 135 | 136 | 137 | def crop_image(im, rect, scale): 138 | try: 139 | xmin, ymin, xmax, ymax = rect 140 | crop = [xmin, ymin, xmax, ymax] 141 | xmin, ymin, xmax, ymax = [int(x / scale) for x in crop] 142 | if ((ymax - ymin) * (xmax - xmin)) > 0.25 * im.size: 143 | cropped = im[ymin:ymax, xmin:xmax] 144 | else: 145 | cropped = im 146 | return cropped 147 | except Exception as e: 148 | print("crop_error_1") # (f"Error: {e}", exc_info=True) 149 | 150 | 151 | def reduce_noise_raw(im): 152 | bilat = cv2.bilateralFilter(im, 4, 75, 75) 153 | blur = cv2.medianBlur(bilat, 1) 154 | return blur 155 | 156 | 157 | def reduce_noise_edges(im): 158 | try: 159 | structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) 160 | opening = cv2.morphologyEx(im, cv2.MORPH_OPEN, structuring_element) 161 | maxed_rows = rank_filter(opening, -4, size=(1, 20)) 162 | maxed_cols = rank_filter(opening, -4, size=(20, 1)) 163 | debordered = np.minimum(np.minimum(opening, maxed_rows), maxed_cols) 164 | return debordered 165 | except Exception as e: 166 | 
print("noise_red_Error")  # (f"Error: {e}", exc_info=True)
167 | 
168 | 
169 | def rects_are_vertical(rect1, rect2, rect_align=2):
170 |     try:
171 |         xmin1, ymin1, xmax1, ymax1 = rect1
172 |         xmin2, ymin2, xmax2, ymax2 = rect2
173 | 
174 |         midpoint1 = (xmin1 + xmax1) / 2
175 |         midpoint2 = (xmin2 + xmax2) / 2
176 |         dist = abs(midpoint1 - midpoint2)
177 | 
178 |         rectarea1 = rect_area(rect1)
179 |         rectarea2 = rect_area(rect2)
180 |         if rectarea1 > rectarea2:
181 |             thres = (xmax1 - xmin1) * rect_align
182 |         else:
183 |             thres = (xmax2 - xmin2) * rect_align
184 | 
185 |         if thres > dist:
186 |             align = True
187 |         else:
188 |             align = False
189 |         return align
190 |     except Exception as e:
191 |         print("vert_rec_Error")  # (f"Error: {e}", exc_info=True)
192 | 
193 | 
194 | def find_final_crop(im, rects, orig_im):
195 |     try:
196 |         current = None
197 |         for rect in rects:
198 |             if current is None:
199 |                 current = rect
200 |                 continue
201 | 
202 |             aligned = rects_are_vertical(current, rect)
203 | 
204 |             if not aligned:
205 |                 continue
206 | 
207 |             current = rect_union(current, rect)
208 |         if current is not None:
209 |             return current
210 |         else:
211 |             return (0, 0, orig_im.shape[1], orig_im.shape[0])  # (xmin, ymin, xmax, ymax); shape is (rows, cols)
212 |     except Exception as e:
213 |         print("crop_Error")  # (f"Error: {e}", exc_info=True)
214 | 
215 | 
216 | def process_image(orig_im):
217 |     try:
218 |         scale, im = downscale_image(orig_im)
219 | 
220 |         blur = reduce_noise_raw(im.copy())
221 | 
222 |         edges = auto_canny(blur.copy())
223 | 
224 |         debordered = reduce_noise_edges(edges.copy())
225 | 
226 |         dilation, rects, num_tries = find_components(debordered, 16)
227 | 
228 |         final_rect = find_final_crop(dilation, rects, orig_im)
229 | 
230 |         cropped = crop_image(orig_im, final_rect, scale)
231 |         # kernel = np.ones((3, 3), np.float32) / 25
232 |         # smooth2d = cv2.filter2D(cropped, -1, kernel=kernel)
233 |         return (cropped, num_tries)
234 |     except Exception as e:
235 |         print("process")  # (f"Error: {e}", exc_info=True)
236 | 
237 | 
238 | def rad_to_deg(theta):
239 |     return theta * 180 / np.pi
240 | 
241 | 
242 | def rotate(image, theta):
243 |     try:
244 |         (h, w) = image.shape[:2]
245 |         center = (w / 2, h / 2)
246 |         M = cv2.getRotationMatrix2D(center, theta, 1)
247 |         rotated = cv2.warpAffine(
248 |             image,
249 |             M,
250 |             (int(w), int(h)),
251 |             flags=cv2.INTER_LINEAR,
252 |             borderMode=cv2.BORDER_CONSTANT,
253 |             borderValue=(255, 255, 255),
254 |         )
255 |         return rotated
256 |     except Exception as e:
257 |         print("rotation_error")  # (f"Error: {e}", exc_info=True)
258 | 
259 | 
260 | def angle_calculation(gray):
261 |     gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY)
262 |     gray = cv2.bitwise_not(gray)
263 |     thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
264 | 
265 |     coords = np.column_stack(np.where(thresh > 0))
266 |     # print(coords, coords.shape)
267 | 
268 |     min_y = coords[0][0]
269 |     max_y = coords[-1][0]
270 |     min_x = coords[0][1]  # x coordinate, not the row index
271 |     max_x = coords[-1][1]
272 | 
273 |     left_most = coords[0]
274 |     right_most = coords[0]
275 |     top_most = coords[0]
276 |     bottom_most = coords[0]
277 |     # print(coords[0], coords[-1])
278 |     for i in range(1, coords.shape[0]):
279 |         y, x = coords[i][0], coords[i][1]
280 |         if y <= min_y:
281 |             min_y = y
282 |             top_most = coords[i]
283 |         elif y >= max_y:
284 |             max_y = y
285 |             bottom_most = coords[i]
286 |         if x <= min_x:
287 |             min_x = x
288 |             left_most = coords[i]
289 |         elif x >= max_x:
290 |             max_x = x
291 |             right_most = coords[i]
292 |     # print(top_most, left_most, bottom_most, right_most)
293 | 
294 |     slopes = []
295 |     edge_coor = [top_most, left_most, 
bottom_most, right_most]
296 |     for i in range(0, len(edge_coor)):
297 |         if i == len(edge_coor) - 1:
298 |             if abs((edge_coor[0][1] - edge_coor[i][1])) >= 10:
299 |                 angle = (
300 |                     (
301 |                         (edge_coor[0][0] - edge_coor[i][0])
302 |                         / (edge_coor[0][1] - edge_coor[i][1])
303 |                     )
304 |                     * 180
305 |                 ) / np.pi
306 |                 slopes.append(angle)
307 |             else:
308 |                 slopes.append(0.0)
309 |         else:
310 |             if abs((edge_coor[i + 1][1] - edge_coor[i][1])) >= 10:
311 |                 angle = (
312 |                     (
313 |                         (edge_coor[i + 1][0] - edge_coor[i][0])
314 |                         / (edge_coor[i + 1][1] - edge_coor[i][1])
315 |                     )
316 |                     * 180
317 |                 ) / np.pi
318 |                 slopes.append(angle)
319 |             else:
320 |                 slopes.append(0.0)
321 |         # img = cv2.circle(thresh, (edge_coor[i][1], edge_coor[i][0]), 5, (255, 0, 0), 2)
322 | 
323 |     slopes = np.asarray(slopes)
324 |     if len(np.where(slopes == 0.0)[0]) >= 2:
325 |         # two or more flat edges: the page is already level, don't rotate
326 |         return None
327 |     else:
328 |         # average the opposing edge slopes before picking a rotation angle
329 |         neg_slope = (slopes[0] + slopes[2]) / 2
330 |         pos_slope = (slopes[1] + slopes[3]) / 2
331 |         # print(pos_slope, neg_slope)
332 |         new_pos_slope = pos_slope
333 |         new_neg_slope = neg_slope
334 |         if pos_slope > 90:
335 |             if pos_slope < 180:
336 |                 new_pos_slope = 180 - pos_slope
337 |             else:
338 |                 new_pos_slope = pos_slope - ((pos_slope // 180) * 180)
339 |             # print(new_pos_slope)
340 |         if neg_slope < -90:
341 |             new_neg_slope = 180 + neg_slope
342 |         # print(new_pos_slope, new_neg_slope)
343 |         if new_pos_slope <= new_neg_slope:
344 |             fin_angle = pos_slope
345 |         else:
346 |             fin_angle = neg_slope
347 | 
348 |         if fin_angle < -90:
349 |             rot_angle = 180 + fin_angle
350 |         elif fin_angle > 90:
351 |             rot_angle = -(180 - fin_angle)
352 |         elif -90 < fin_angle < 0:
353 |             rot_angle = fin_angle
354 |         else:  # covers 0 and the exact right-angle cases so rot_angle is always bound
355 |             rot_angle = fin_angle
356 |         return rot_angle
357 | 
358 | 
359 | def estimate_skew(image):
360 |     try:
361 |         osd = pytesseract.image_to_osd(image)
362 |         angle = float(re.search(r"(?<=Rotate: )\d+", osd).group(0))
363 |         if angle == 0:
364 |             # fin_image = rotate(image_gray, angle)
365 |             edges = auto_canny(image)
366 |             # print(edges.shape)
367 |             # print("edges found: ", edges)
368 |             lines = cv2.HoughLines(edges, 1, np.pi / 270, 400)
369 |             # print("lines found: ", lines)
370 |             if lines is not None:
371 |                 new = edges.copy()
372 |                 thetas = []
373 |                 for line in lines:
374 |                     for rho, theta in line:
375 |                         a = np.cos(theta)
376 |                         b = np.sin(theta)
377 |                         x0 = a * rho
378 |                         y0 = b * rho
379 |                         x1 = int(x0 + 1000 * (-b))
380 |                         y1 = int(y0 + 1000 * (a))
381 |                         x2 = int(x0 - 1000 * (-b))
382 |                         y2 = int(y0 - 1000 * (a))
383 |                         if theta > np.pi / 3 and theta < np.pi * 2 / 3:
384 |                             thetas.append(theta)
385 |                         new = cv2.line(new, (x1, y1), (x2, y2), (255, 255, 255), 1)
386 | 
387 |                 theta_mean = np.mean(thetas) if len(thetas) > 0 else 0.0
388 |                 theta = -(90 - rad_to_deg(theta_mean))
389 |             else:
390 |                 # theta = angle_calculation(image)
391 |                 theta = 0.0
392 |         else:
393 |             theta = angle
394 |         return theta
395 |     except Exception as e:
396 |         print("theta_error")  # (f"Error: {e}", exc_info=True)
397 | 
398 | 
399 | def process_skewed_crop(image):
400 |     try:
401 |         theta = estimate_skew(image)
402 |         # print(theta)
403 |         # ret, thresh = cv2.threshold(image, 0, 127, cv2.THRESH_OTSU)
404 |         # print(thresh)
405 |         if theta is not None and (theta % 90) != 0:
406 |             rotated = rotate(image, theta)
407 |         elif theta is not None and (theta % 90) == 0:
408 |             rotated = imutils.rotate_bound(image, theta) 
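        # imutils.rotate_bound, unlike the cv2.warpAffine-based rotate() above,
        # grows the output canvas during rotation so no page content is clipped
        # at the coarse 90-degree angles reported by Tesseract's OSD.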
409 | else: 410 | rotated = image 411 | # print(rotated) 412 | return rotated, theta 413 | except Exception as e: 414 | print("skew_Error") # (f"Error: {e}", exc_info=True) 415 | 416 | 417 | def preprocess_image(file_path: str): 418 | try: 419 | gray_page = cv2.imread(file_path, 0) 420 | process_page = PagePreprocess(gray_page) 421 | _ = process_page.crop() 422 | deskewed_page = process_page.deskew() 423 | # cv2.imwrite(file_path, deskewed_page) 424 | return deskewed_page 425 | except Exception as e: 426 | print("process_image_error") # (f"Error: {e}", exc_info=True) 427 | 428 | 429 | def preprocess_image_file(img): 430 | try: 431 | # converted_image = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR) 432 | gray_page = cv2.cvtColor(np.array(img), cv2.COLOR_BGR2GRAY) 433 | # gray_page = cv2.cvtColor(gray_page, cv2.COLOR_BGR2RGB) 434 | process_page = PagePreprocess(gray_page) 435 | _ = process_page.crop() 436 | deskewed_page = process_page.deskew() 437 | return deskewed_page 438 | except Exception as e: 439 | print("error") # (f"Error: {e}", exc_info=True) 440 | -------------------------------------------------------------------------------- /multiple_models/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pre_process_text import ( 4 | pdf_to_images, 5 | read_text_from_pages, 6 | read_text_from_pages_v2, 7 | ) 8 | from hybrid_v1 import train_hybrid_v1 9 | from hybrid_v2 import train_hybrid_v2 10 | 11 | 12 | def process( 13 | dataset_path: str, 14 | save_dir: str, 15 | pdf_check: bool, 16 | artifact_name: str, 17 | num_words_to_read: int, 18 | ): 19 | updated_dataset_path = os.path.join(save_dir, artifact_name, "dataset") 20 | os.makedirs(updated_dataset_path, exist_ok=True) 21 | for document_type in sorted(os.listdir(dataset_path)): 22 | if document_type != ".DS_Store": 23 | folder_path = os.path.join(dataset_path, document_type) 24 | updated_document_type_folder_path = os.path.join( 25 | updated_dataset_path, document_type 26 | ) 27 | os.makedirs(updated_document_type_folder_path, exist_ok=True) 28 | for documents in sorted(os.listdir(folder_path)): 29 | if documents != ".DS_Store": 30 | document_path = os.path.join(folder_path, documents) 31 | if pdf_check: 32 | # Perform conversion and store the images in a temp folder 33 | pdf_to_images( 34 | full_path_pdf=document_path, 35 | converted_images_path=updated_document_type_folder_path, 36 | meta_name=artifact_name, 37 | ) 38 | if pdf_check: 39 | images_data_path = os.path.join(save_dir, artifact_name) 40 | else: 41 | images_data_path = dataset_path 42 | master_data_path = read_text_from_pages( 43 | complete_folder_path=images_data_path, 44 | path_to_save_essential_data=save_dir, 45 | meta_name=artifact_name, 46 | num_of_words=num_words_to_read, 47 | ) 48 | return master_data_path 49 | 50 | 51 | def single_level(args): 52 | all_data_path = process( 53 | dataset_path=args.data_path, 54 | save_dir=args.file_path, 55 | pdf_check=bool(args.pdfs), 56 | artifact_name=args.art_name, 57 | num_words_to_read=int(args.num_of_words), 58 | ) 59 | train_hybrid_v1( 60 | text_plus_file_path=all_data_path, 61 | batch_size=int(args.batch_size), 62 | epochs=int(args.epochs), 63 | image_shape=int(args.img_shape), 64 | max_words=int(args.num_of_words), 65 | artifact_name=args.art_name, 66 | save_dir_path=args.file_path, 67 | trained_model_path=args.model_path, 68 | ) 69 | 70 | 71 | def process_multi_level( 72 | dataset_path: str, 73 | save_dir: str, 74 | pdf_check: bool, 75 | 
artifact_name: str,
76 |     num_words_to_read: int,
77 | ):
78 |     updated_dataset_path = os.path.join(save_dir, artifact_name, "dataset")
79 |     os.makedirs(updated_dataset_path, exist_ok=True)
80 |     for master_document_type in sorted(os.listdir(dataset_path)):
81 |         if master_document_type != ".DS_Store":
82 |             document_folder_path = os.path.join(dataset_path, master_document_type)
83 |             updated_master_document_type_folder_path = os.path.join(
84 |                 updated_dataset_path, master_document_type
85 |             )
86 |             os.makedirs(updated_master_document_type_folder_path, exist_ok=True)
87 |             for document_type in sorted(os.listdir(document_folder_path)):
88 |                 if document_type != ".DS_Store":
89 |                     folder_path = os.path.join(document_folder_path, document_type)
90 |                     updated_document_type_folder_path = os.path.join(
91 |                         updated_master_document_type_folder_path, document_type
92 |                     )
93 |                     os.makedirs(updated_document_type_folder_path, exist_ok=True)
94 |                     for documents in sorted(os.listdir(folder_path)):
95 |                         if documents != ".DS_Store":
96 |                             document_path = os.path.join(folder_path, documents)
97 |                             if pdf_check:
98 |                                 # Perform conversion and store the images in a temp folder
99 |                                 pdf_to_images(
100 |                                     full_path_pdf=document_path,
101 |                                     converted_images_path=updated_document_type_folder_path,
102 |                                     meta_name=artifact_name,
103 |                                 )
104 |     if pdf_check:
105 |         images_data_path = os.path.join(save_dir, artifact_name)
106 |     else:
107 |         images_data_path = dataset_path
108 |     master_data_path = read_text_from_pages_v2(
109 |         complete_folder_path=images_data_path,
110 |         path_to_save_essential_data=save_dir,
111 |         meta_name=artifact_name,
112 |         num_of_words=num_words_to_read,
113 |     )
114 |     return master_data_path
115 | 
116 | 
117 | def multi_level(args):
118 |     all_data_path = process_multi_level(
119 |         dataset_path=args.data_path,
120 |         save_dir=args.file_path,
121 |         pdf_check=bool(args.pdfs),
122 |         artifact_name=args.art_name,
123 |         num_words_to_read=int(args.num_of_words),
124 |     )
125 |     train_hybrid_v2(
126 |         text_plus_file_path=all_data_path,
127 |         batch_size=int(args.batch_size),
128 |         epochs=int(args.epochs),
129 |         image_shape=int(args.img_shape),
130 |         max_words=int(args.num_of_words),
131 |         artifact_name=args.art_name,
132 |         save_dir_path=args.file_path,
133 |         trained_model_path=args.model_path,
134 |     )
135 | 
136 | 
137 | parser = argparse.ArgumentParser()
138 | parser.add_argument("-dp", "--data_path", help="File path of the dataset")
139 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts")
140 | parser.add_argument("-a", "--art_name", help="Artifacts name")
141 | parser.add_argument("-p", "--pdfs", default=False, help="Set when the dataset contains PDFs")
142 | parser.add_argument("-n", "--num_of_words", default=10, help="Number of words to read")
143 | parser.add_argument("-b", "--batch_size", default=8, help="Batch size for training")
144 | parser.add_argument("-e", "--epochs", default=3, help="Number of epochs")
145 | parser.add_argument("-is", "--img_shape", default=100, help="One dimension of image")
146 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model")
147 | args = parser.parse_args()
148 | single_level(args=args)
149 | # multi_level(args=args)
--------------------------------------------------------------------------------
/trial.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | image_path = "/Users/vsatpathy/Desktop/off_POCs/intel-image-classification/seg_train/buildings/0.jpg"
4 | 
5 | with open(image_path, 
"rb") as f: 6 | image_bytes = f.read() 7 | 8 | files = { 9 | "image": ("test_image", image_bytes), 10 | } 11 | url = "http://127.0.0.1:5000/predict_image" 12 | # url = "https://bentoml.smartbox-capture.com/predict_document_labels_som" 13 | 14 | response = requests.post(url, files=files) 15 | print(response.text) 16 | -------------------------------------------------------------------------------- /vanilla_GAN/__pycache__/bento_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/vanilla_GAN/__pycache__/bento_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /vanilla_GAN/artifacts/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/vanilla_GAN/artifacts/saved_model.pb -------------------------------------------------------------------------------- /vanilla_GAN/artifacts/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/vanilla_GAN/artifacts/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /vanilla_GAN/artifacts/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/vanilla_GAN/artifacts/variables/variables.index -------------------------------------------------------------------------------- /vanilla_GAN/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import DigitGenerator 2 | from tensorflow.keras.models import load_model 3 | import tensorflow as tf 4 | 5 | 6 | def classifier_models(model_service, model_path: str): 7 | model_gen = load_model(model_path) 8 | tf.saved_model.save(model_gen, "artifacts/") 9 | model_gen = tf.saved_model.load("artifacts/") 10 | model_service.pack("model", model_gen) 11 | 12 | 13 | def main(): 14 | model_service = DigitGenerator() 15 | classifier_models(model_service=model_service, model_path=generator_model_path) 16 | saved_path = model_service.save() 17 | 18 | 19 | generator_model_path = ( 20 | "/Users/vsatpathy/Desktop/docs/training_data/van_gan/generator.h5" 21 | ) 22 | main() -------------------------------------------------------------------------------- /vanilla_GAN/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.adapters import JsonInput 3 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 4 | 5 | import tensorflow as tf 6 | import importlib.util 7 | import numpy as np 8 | from PIL import Image 9 | 10 | 11 | @bentoml.env(infer_pip_packages=True) 12 | @bentoml.artifacts([TensorflowSavedModelArtifact("model")]) 13 | class DigitGenerator(bentoml.BentoService): 14 | @bentoml.api(input=JsonInput()) 15 | def generate_image(self, file_stream): 16 | model = self.artifacts.model.signatures["serving_default"] 17 | model._num_positional_args = 1 18 | noise = np.random.normal(0, 1, (1, 100)) 19 | noise = tf.convert_to_tensor(noise, 
dtype=tf.float32) 20 | results = model(noise) 21 | generated_image = results.get("dense_3")[0].numpy().reshape(28, 28) 22 | return {"digit_generated": generated_image} 23 | -------------------------------------------------------------------------------- /vanilla_GAN/infer.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import load_model 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from PIL import Image 5 | 6 | 7 | def test(gen_model_path: str, i: int): 8 | gen = load_model(gen_model_path) 9 | noise = np.random.normal(0, 1, (1, 100)) 10 | image = np.squeeze(gen.predict(noise), axis=0) 11 | plt.imsave( 12 | "/Users/vsatpathy/Desktop/off_POCs/cycle_gan/epoch_%d" % i, 13 | image.reshape(28, 28), 14 | format="jpg", 15 | cmap="gray", 16 | ) 17 | 18 | 19 | generator_model_path = ( 20 | "/Users/vsatpathy/Desktop/docs/training_data/van_gan/generator.h5" 21 | ) 22 | test(gen_model_path=generator_model_path, i=0) 23 | -------------------------------------------------------------------------------- /vanilla_GAN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import mlflow 5 | import argparse 6 | 7 | from tensorflow.keras.models import Model, Sequential 8 | from tensorflow.keras.datasets import mnist 9 | from tensorflow.keras.optimizers import Adam 10 | from tensorflow.keras import backend as K 11 | from tensorflow.keras import initializers 12 | from tensorflow.keras.layers import ( 13 | Reshape, 14 | Dense, 15 | Dropout, 16 | Flatten, 17 | BatchNormalization, 18 | Convolution2D, 19 | UpSampling2D, 20 | Input, 21 | LeakyReLU, 22 | ) 23 | 24 | 25 | tracking_uri = ( 26 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 27 | ) 28 | # tracking_uri = "postgresql://postgres:postgres@localhost:5432/" 29 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 30 | 31 | 32 | def generator(): 33 | gen = Sequential() 34 | gen.add(Dense(256, input_dim=100)) 35 | gen.add(LeakyReLU(0.2)) 36 | gen.add(Dense(512)) 37 | gen.add(LeakyReLU(0.2)) 38 | gen.add(Dense(1024)) 39 | gen.add(LeakyReLU(0.2)) 40 | gen.add(Dense(784, activation="tanh")) 41 | gen.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 42 | return gen 43 | 44 | 45 | def discriminator(): 46 | disc = Sequential() 47 | disc.add(Dense(1024, input_dim=784)) 48 | disc.add(LeakyReLU(0.2)) 49 | disc.add(Dropout(0.2)) 50 | disc.add(Dense(512)) 51 | disc.add(LeakyReLU(0.2)) 52 | disc.add(Dropout(0.2)) 53 | disc.add(Dense(256)) 54 | disc.add(LeakyReLU(0.2)) 55 | disc.add(Dropout(0.2)) 56 | disc.add(Dense(1, activation="sigmoid")) 57 | disc.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 58 | return disc 59 | 60 | 61 | def stacked_GAN(gen, disc): 62 | disc.trainable = False 63 | gan_input = Input(shape=(100,)) 64 | x = gen(gan_input) 65 | gan_out = disc(x) 66 | gan_stack = Model(inputs=gan_input, outputs=gan_out) 67 | gan_stack.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 68 | return gan_stack 69 | 70 | 71 | def train( 72 | gen, 73 | disc, 74 | gan_stack, 75 | max_iter: int, 76 | batch_size: int, 77 | img_shape: int, 78 | file_path: str, 79 | artifact_name: str, 80 | exp_name: str, 81 | ): 82 | 83 | mlflow.set_tracking_uri(tracking_uri) 84 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 85 | try: 86 | expr_name = exp_name # 
create a new experiment (do not replace)
87 |         mlflow.create_experiment(expr_name, s3_bucket)
88 |         mlflow.set_experiment(expr_name)
89 |         experiment = mlflow.get_experiment_by_name(exp_name)
90 |     except Exception:
91 |         experiment = mlflow.get_experiment_by_name(exp_name)
92 | 
93 |     os.makedirs(os.path.join(file_path, artifact_name), exist_ok=True)
94 |     mlflow.tensorflow.autolog(every_n_iter=1)
95 |     with mlflow.start_run(experiment_id=experiment.experiment_id) as run:
96 | 
97 |         mlflow.log_metrics(
98 |             {
99 |                 "batch_size": batch_size,
100 |                 "epochs": max_iter,
101 |                 "image_shape": img_shape,
102 |             }
103 |         )
104 | 
105 |         (X_train, _), (_, _) = mnist.load_data()
106 |         X_train = (X_train.astype(np.float32) - 127.5) / 127.5
107 |         X_train = X_train.reshape(X_train.shape[0], img_shape)
108 | 
109 |         for i in range(0, max_iter):
110 |             noise = np.random.normal(0, 1, (batch_size, 100))
111 |             image_batch = X_train[
112 |                 np.random.randint(0, X_train.shape[0], size=batch_size)
113 |             ]
114 | 
115 |             fake_images = gen.predict(noise)
116 | 
117 |             final_images = np.concatenate([image_batch, fake_images])
118 |             final_labels = np.concatenate(
119 |                 (
120 |                     np.ones((np.int64(batch_size), 1)),
121 |                     np.zeros((np.int64(batch_size), 1)),
122 |                 )
123 |             )
124 | 
125 |             disc.trainable = True
126 |             disc_loss = disc.train_on_batch(final_images, final_labels)
127 | 
128 |             disc.trainable = False
129 |             y_mis_labels = np.ones(batch_size)
130 |             gen_loss = gan_stack.train_on_batch(noise, y_mis_labels)
131 | 
132 |             mlflow.log_metrics(
133 |                 {"generator_loss": gen_loss, "discriminator_loss": disc_loss}
134 |             )
135 | 
136 |             print(
137 |                 "epoch_%d---->gen_loss:[%f]---->disc_loss:[%f]"
138 |                 % (i, gen_loss, disc_loss)
139 |             )
140 |             # if i % 1000 == 0:
141 |             #     test(gen, i)
142 | 
143 |         gen.save(os.path.join(file_path, artifact_name, "generator.h5"))
144 |         # disc.save(os.path.join(file_path, artifact_name, "discriminator.h5"))
145 | 
146 |         meta_data_path = os.path.join(file_path, artifact_name)
147 |         for artifact in sorted(os.listdir(meta_data_path)):
148 |             if artifact != ".DS_Store":
149 |                 artifact_path = os.path.join(meta_data_path, artifact)
150 |                 if (
151 |                     os.path.isfile(artifact_path)
152 |                     and artifact_path.split(".")[-1] != "h5"
153 |                 ):
154 |                     print(f"artifact to be uploaded is: {artifact}")
155 |                     mlflow.log_artifact(local_path=artifact_path)
156 | 
157 |         artifact_uri = mlflow.get_artifact_uri()
158 |         print(artifact_uri)
159 |     mlflow.end_run()
160 | 
161 | 
162 | parser = argparse.ArgumentParser()
163 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts")
164 | parser.add_argument("-a", "--art_name", help="Artifacts name")
165 | parser.add_argument("-b", "--batch_size", default=32, help="Batch size for training")
166 | parser.add_argument("-e", "--epochs", default=20000, help="Number of epochs")
167 | parser.add_argument("-is", "--img_shape", default=784, help="One dimension of image")
168 | parser.add_argument(
169 |     "-exp",
170 |     "--experiment_name",
171 |     default="vanilla_gan",
172 |     help="Name of the experiment for tracking",
173 | )
174 | args = parser.parse_args()
175 | # Build a single generator/discriminator pair and reuse it inside the stacked
176 | # GAN; constructing fresh instances for stacked_GAN would train one pair of
177 | # networks while stacking a different, untrained pair.
178 | gen = generator()
179 | disc = discriminator()
180 | train(
181 |     gen=gen,
182 |     disc=disc,
183 |     gan_stack=stacked_GAN(gen=gen, disc=disc),
184 |     max_iter=int(args.epochs),
185 |     batch_size=int(args.batch_size),
186 |     img_shape=int(args.img_shape),
187 |     file_path=args.file_path,
188 |     artifact_name=args.art_name,
189 |     exp_name=args.experiment_name,
190 | )
191 | 
--------------------------------------------------------------------------------