├── .gitattributes ├── .gitignore ├── 2_level_doc_classification ├── bento_package.py ├── bento_predictor.py ├── docker-compose.yml ├── ecs-params.yml ├── hybrid_v2.py ├── pre_process_text.py ├── pre_processing.py └── train.py ├── LICENSE ├── README.md ├── conditional_GAN ├── __pycache__ │ └── bento_predictor.cpython-38.pyc ├── artifacts │ ├── saved_model.pb │ └── variables │ │ ├── variables.data-00000-of-00001 │ │ └── variables.index ├── bento_package.py ├── bento_predictor.py ├── infer.py └── train.py ├── document_classification ├── __pycache__ │ ├── hybrid_v1.cpython-38.pyc │ ├── pre_process_text.cpython-38.pyc │ └── pre_processing.cpython-38.pyc ├── bento_package.py ├── bento_predictor.py ├── docker-compose.yml ├── ecs-params.yml ├── hybrid_v1.py ├── pre_process_text.py ├── pre_processing.py └── train.py ├── image_classifier ├── __pycache__ │ └── bento_predictor.cpython-38.pyc ├── artifacts │ ├── saved_model.pb │ └── variables │ │ ├── variables.data-00000-of-00001 │ │ └── variables.index ├── bento_package.py ├── bento_predictor.py ├── docker-compose.yml ├── ecs-params.yml ├── infer.py └── train.py ├── multiple_models ├── bento_package.py ├── bento_predictor.py ├── docker-compose.yml ├── ecs-params.yml ├── hybrid_v1.py ├── hybrid_v2.py ├── pre_process_text.py ├── pre_processing.py └── train.py ├── trial.py └── vanilla_GAN ├── __pycache__ └── bento_predictor.cpython-38.pyc ├── artifacts ├── saved_model.pb └── variables │ ├── variables.data-00000-of-00001 │ └── variables.index ├── bento_package.py ├── bento_predictor.py ├── infer.py └── train.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.pyc -------------------------------------------------------------------------------- /2_level_doc_classification/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import ModelZoo 2 | import tensorflow as tf 3 | from tensorflow.keras.models import load_model 4 | from tensorflow.keras.preprocessing.text import Tokenizer 5 | import spacy 6 | import json 7 | 8 | 9 | def som_models( 10 | model_zoo, 11 | tf_model_path_som: str, 12 | text_file_path_som: str, 13 | master_labels_path: str, 14 | sub_labels_path: str, 15 | ): 16 | model_cnn = load_model(tf_model_path_som) 17 | tf.saved_model.save(model_cnn, "artifacts/") 18 | model_cnn = tf.saved_model.load("artifacts/") 19 | model_zoo.pack("som_model", model_cnn) 20 | 21 | text_model = spacy.load("en_core_web_sm") 22 | model_zoo.pack("som_spacy_model", text_model) 23 | 24 | tokenizer = Tokenizer() 25 | with open(text_file_path_som, "r") as f: 26 | bow = f.read() 27 | tokenizer.fit_on_texts(bow.split("####")) 28 | model_zoo.pack("som_tokenizer", tokenizer) 29 | 30 | with open(master_labels_path, "r") as f: 31 | labels_som = json.load(f) 32 | model_zoo.pack("som_master_labels", labels_som) 33 | 34 | with open(sub_labels_path, "r") as g: 35 | sub_labels_som = json.load(g) 36 | model_zoo.pack("som_sub_labels", sub_labels_som) 37 | 38 | 39 | def main(): 40 | model_zoo = ModelZoo() 41 | som_models( 42 | model_zoo=model_zoo, 43 | tf_model_path_som=tf_model_path_som, 44 | text_file_path_som=text_file_path_som, 45 | master_labels_path=master_labels_path, 46 |
sub_labels_path=sub_labels_path, 47 | ) 48 | saved_path = model_zoo.save() 49 | 50 | 51 | tf_model_path_som = ( 52 | "/Users/vsatpathy/Desktop/docs/training_data/som/document_classifier.h5" 53 | ) 54 | text_file_path_som = ( 55 | "/Users/vsatpathy/Desktop/docs/training_data/som/file_and_text_som.txt" 56 | ) 57 | master_labels_path = ( 58 | "/Users/vsatpathy/Desktop/docs/training_data/som/rev_labels_master_som.json" 59 | ) 60 | sub_labels_path = "/Users/vsatpathy/Desktop/docs/training_data/som/rev_labels_som.json" 61 | main() 62 | -------------------------------------------------------------------------------- /2_level_doc_classification/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.types import FileLike 3 | from bentoml.adapters import JsonInput, FileInput, MultiFileInput 4 | from bentoml.frameworks.spacy import SpacyModelArtifact 5 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 6 | from bentoml.service.artifacts.common import ( 7 | JSONArtifact, 8 | PickleArtifact, 9 | ) 10 | 11 | import tensorflow as tf 12 | from tensorflow.keras.preprocessing.image import load_img 13 | import numpy as np 14 | from pytesseract import image_to_string 15 | import re 16 | from PIL import Image 17 | from typing import List 18 | 19 | 20 | @bentoml.env(infer_pip_packages=True) 21 | @bentoml.artifacts( 22 | [ 23 | TensorflowSavedModelArtifact("som_model"), 24 | SpacyModelArtifact("som_spacy_model"), 25 | PickleArtifact("som_tokenizer"), 26 | JSONArtifact("som_master_labels"), 27 | JSONArtifact("som_sub_labels"), 28 | ] 29 | ) 30 | class ModelZoo(bentoml.BentoService): 31 | def helper(self, text): 32 | dummy = [] 33 | for word in text: 34 | dummy.append(str(word)) 35 | final = " ".join(dummy) 36 | return final 37 | 38 | def preprocess_spacy(self, spacy_model, text, num_of_words: int): 39 | text = str(text) 40 | text = text.split(" ") 41 | text = self.helper(text) 42 | text = str(text.lower()) 43 | # Remove all the special characters 44 | text = re.sub(r"\W", " ", text) 45 | text = re.sub(r"[^a-zA-Z ]+", "", text) 46 | # remove all single characters 47 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 48 | # Remove single characters from the start 49 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 50 | # Substituting multiple spaces with single space 51 | text = re.sub(r"\s+", " ", text, flags=re.I) 52 | # text = self.artifacts.mcr_spacy_model(text) 53 | text = spacy_model(text) 54 | filtered = [token.lemma_ for token in text if token.is_stop == False] 55 | text = " ".join(filtered[: num_of_words * 2]) 56 | text = text.strip().split(" ") 57 | text = " ".join(text[:num_of_words]) 58 | return text 59 | 60 | def tokenize_sentence(self, sentence, tokenizer, maximum_word_length): 61 | updated_sentence = sentence.split(" ") 62 | tok_sent = [] 63 | for word in updated_sentence: 64 | if word in tokenizer.word_index: 65 | tok_sent.append(tokenizer.word_index[word]) 66 | else: 67 | tok_sent.append(0) 68 | if len(tok_sent) != maximum_word_length: 69 | delta = maximum_word_length - len(tok_sent) 70 | for i in range(delta): 71 | tok_sent.append(0) 72 | return tok_sent 73 | 74 | def pre_process_image(self, image_file): 75 | ocr_image = np.asarray(Image.open(image_file)) 76 | image = np.asarray( 77 | Image.open(image_file).convert(mode="RGB").resize((100, 100)) 78 | ) 79 | image = np.divide(image, 255.0) 80 | image = np.asarray([image]).astype("float32") 81 | return ocr_image, image 82 | 83 | def pre_process_som(self, 
file): 84 | ocr_image, image = self.pre_process_image(image_file=file) 85 | doc_text = image_to_string(ocr_image) 86 | doc_text_processed = self.preprocess_spacy( 87 | spacy_model=self.artifacts.som_spacy_model, text=doc_text, num_of_words=10 88 | ) 89 | fin_text = self.tokenize_sentence( 90 | sentence=doc_text_processed, 91 | tokenizer=self.artifacts.som_tokenizer, 92 | maximum_word_length=10, 93 | ) 94 | return image, np.asarray([fin_text]).astype("float32") 95 | 96 | @bentoml.api(input=FileInput()) 97 | def predict_document_labels_som(self, file_stream): 98 | image, text = self.pre_process_som(file=file_stream) 99 | model = self.artifacts.som_model.signatures["serving_default"] 100 | model._num_positional_args = 2 101 | results = model(tf.constant(text), tf.constant(image)) 102 | mas_results = results.get("dense_1")[0].numpy() 103 | sub_results = results.get("dense_4")[0].numpy() 104 | master_label = self.artifacts.som_master_labels[str(np.argmax(mas_results))] 105 | sub_label = self.artifacts.som_sub_labels[str(np.argmax(sub_results))] 106 | return {"master document type": master_label, "sub document type": sub_label} 107 | -------------------------------------------------------------------------------- /2_level_doc_classification/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | web: 4 | image: 142339138776.dkr.ecr.us-east-2.amazonaws.com/docuedge-model-zoo:latest 5 | ports: 6 | - "5000:5000" 7 | logging: 8 | driver: awslogs 9 | options: 10 | awslogs-group: docuedge-modelserver-ecs 11 | awslogs-region: us-east-2 12 | awslogs-stream-prefix: web 13 | volumes: 14 | - /app/temp 15 | -------------------------------------------------------------------------------- /2_level_doc_classification/ecs-params.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | task_definition: 3 | task_execution_role: ecsTaskExecutionRoleBento 4 | ecs_network_mode: awsvpc 5 | task_size: 6 | mem_limit: 8GB 7 | cpu_limit: 4096 8 | efs_volumes: 9 | - name: docuedgedev-efs 10 | filesystem_id: fs-4717c93f 11 | root_directory: /smartbox-config 12 | run_params: 13 | network_configuration: 14 | awsvpc_configuration: 15 | subnets: 16 | - subnet-00e7bff093931a167 17 | - subnet-0345b051535c9625d 18 | security_groups: 19 | - sg-09b7e06cb8b13167d 20 | - sg-0601a52d4ea28af05 21 | assign_public_ip: ENABLED 22 | -------------------------------------------------------------------------------- /2_level_doc_classification/hybrid_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | from tensorflow.keras.preprocessing.image import load_img 5 | from tensorflow.keras.preprocessing.text import Tokenizer 6 | from tensorflow.keras.layers import ( 7 | Input, 8 | Conv2D, 9 | Dense, 10 | Flatten, 11 | Embedding, 12 | Concatenate, 13 | GlobalMaxPool1D, 14 | Conv1D, 15 | MaxPooling1D, 16 | ) 17 | from tensorflow.keras.models import Model, load_model 18 | import os 19 | import json 20 | import mlflow 21 | import mlflow.tensorflow 22 | 23 | tracking_uri = ( 24 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 25 | ) 26 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 27 | 28 | 29 | def read_data(path): 30 | bow = open(path, "r") 31 | data = bow.readlines() 32 | all_data_paths = [] 33 | all_texts = [] 34 | doc_type_y_labels = {} 35 | master_doc_type_y_labels = 
{} 36 | for line in data: 37 | line_data = line.split("####") 38 | all_data_paths.append(line_data[0]) 39 | all_texts.append(line_data[-1][:-1]) 40 | doc_type_label = line_data[0].split("/")[-2] 41 | master_doc_type_label = line_data[0].split("/")[-3] 42 | if doc_type_label not in doc_type_y_labels: 43 | doc_type_y_labels[doc_type_label] = len(doc_type_y_labels) 44 | if master_doc_type_label not in master_doc_type_y_labels: 45 | master_doc_type_y_labels[master_doc_type_label] = len( 46 | master_doc_type_y_labels 47 | ) 48 | 49 | rev_labels_doc_type = {} 50 | for key, val in doc_type_y_labels.items(): 51 | rev_labels_doc_type[val] = key 52 | rev_labels_master_doc_type = {} 53 | for key, val in master_doc_type_y_labels.items(): 54 | rev_labels_master_doc_type[val] = key 55 | 56 | return ( 57 | all_data_paths, 58 | doc_type_y_labels, 59 | rev_labels_doc_type, 60 | all_texts, 61 | master_doc_type_y_labels, 62 | rev_labels_master_doc_type, 63 | ) 64 | 65 | 66 | def tokenize_sentence(sentence, tokenizer, maximum_word_length): 67 | updated_sentence = sentence.split(" ") 68 | tok_sent = [] 69 | for word in updated_sentence: 70 | if word in tokenizer.word_index: 71 | tok_sent.append(tokenizer.word_index[word]) 72 | else: 73 | tok_sent.append(0) 74 | if len(tok_sent) != maximum_word_length: 75 | delta = maximum_word_length - len(tok_sent) 76 | for i in range(delta): 77 | tok_sent.append(0) 78 | return tok_sent 79 | 80 | 81 | def data_loader_text( 82 | bs, 83 | data, 84 | y_lab, 85 | tokenizer, 86 | text_data, 87 | image_input_shape, 88 | max_word_length, 89 | y_sub_labels, 90 | ): 91 | while True: 92 | images = [] 93 | master_labels = [] 94 | sub_labels = [] 95 | texts = [] 96 | while len(images) < bs: 97 | indice = random.randint(0, len(data) - 1) 98 | target = data[indice].split("/")[-3] 99 | sub_target = data[indice].split("/")[-2] 100 | master_labels.append(y_lab[target]) 101 | sub_labels.append(y_sub_labels[sub_target]) 102 | 103 | test_img = np.asarray(load_img(data[indice], target_size=image_input_shape)) 104 | img = np.divide(test_img, 255.0) 105 | images.append(img) 106 | 107 | tok_sen = tokenize_sentence( 108 | text_data[indice], tokenizer, maximum_word_length=max_word_length 109 | ) 110 | texts.append(tok_sen) 111 | yield [np.asarray(images), np.asarray(texts)], [ 112 | np.asarray(master_labels), 113 | np.asarray(sub_labels), 114 | ] 115 | 116 | 117 | def model_arc(y_labels, tokenizer, text_model_inp_shape, image_inp_shape, y_sub_labels): 118 | inp_layer_texts = Input(shape=text_model_inp_shape) 119 | inp_layer_images = Input(shape=image_inp_shape) 120 | 121 | embedding_layer = Embedding( 122 | input_dim=len(tokenizer.word_index) + 1, 123 | output_dim=64, 124 | input_length=text_model_inp_shape, 125 | trainable=True, 126 | )(inp_layer_texts) 127 | pooling_layer = GlobalMaxPool1D()(embedding_layer) 128 | dense_layer = Dense(units=64, activation="relu")(pooling_layer) 129 | 130 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 131 | inp_layer_images 132 | ) 133 | flatten_layer = Flatten()(conv_layer) 134 | 135 | concat_layer = Concatenate()([flatten_layer, dense_layer]) 136 | out_layer = Dense(len(y_labels), activation="softmax")(concat_layer) 137 | 138 | sub_model_inp = Dense(units=64, activation="relu")(out_layer) 139 | sub_dense_layer = Dense(units=256, activation="relu")(sub_model_inp) 140 | 141 | sub_concat_layer = Concatenate()([sub_dense_layer, concat_layer]) 142 | sub_out_layer = Dense(units=len(y_sub_labels), activation="softmax")( 143 | sub_concat_layer 144 
| ) 145 | 146 | model = Model([inp_layer_images, inp_layer_texts], [out_layer, sub_out_layer]) 147 | model.compile( 148 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 149 | ) 150 | return model 151 | 152 | 153 | def train_hybrid_v2( 154 | text_plus_file_path: str, 155 | batch_size: int, 156 | epochs: int, 157 | image_shape: int, 158 | max_words: int, 159 | artifact_name: str, 160 | save_dir_path: str, 161 | trained_model_path: str, 162 | experiment_name: str, 163 | ): 164 | mlflow.set_tracking_uri(tracking_uri) 165 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 166 | try: 167 | expr_name = experiment_name # create a new experiment (do not replace) 168 | mlflow.create_experiment(expr_name, s3_bucket) 169 | mlflow.set_experiment(expr_name) 170 | experiment = mlflow.get_experiment_by_name(experiment_name) 171 | except: 172 | experiment = mlflow.get_experiment_by_name(experiment_name) 173 | 174 | ( 175 | all_imgs_path, 176 | doc_type_y_labels, 177 | rev_labels_doc_type, 178 | all_text, 179 | master_doc_type_label, 180 | rev_labels_master_doc_type, 181 | ) = read_data(path=text_plus_file_path) 182 | num_train_img = len(all_imgs_path) 183 | 184 | with open( 185 | os.path.join(save_dir_path, artifact_name, f"rev_labels_{artifact_name}.json"), 186 | "w+", 187 | ) as tar: 188 | json.dump(rev_labels_doc_type, tar) 189 | with open( 190 | os.path.join( 191 | save_dir_path, artifact_name, f"rev_labels_master_{artifact_name}.json" 192 | ), 193 | "w+", 194 | ) as tar: 195 | json.dump(rev_labels_master_doc_type, tar) 196 | 197 | print("target_encodings: ", master_doc_type_label) 198 | print("target_encodings: ", doc_type_y_labels) 199 | print("Number of training images: ", num_train_img) 200 | 201 | bow = open(text_plus_file_path, "r") 202 | tokenizer = Tokenizer() 203 | tokenizer.fit_on_texts(bow.read().split("####")) 204 | 205 | train_gen = data_loader_text( 206 | tokenizer=tokenizer, 207 | y_lab=master_doc_type_label, 208 | data=all_imgs_path, 209 | text_data=all_text, 210 | bs=batch_size, 211 | image_input_shape=(image_shape, image_shape, 3), 212 | max_word_length=max_words, 213 | y_sub_labels=doc_type_y_labels, 214 | ) 215 | if os.path.isfile(trained_model_path): 216 | model = load_model(trained_model_path) 217 | else: 218 | model = model_arc( 219 | y_labels=master_doc_type_label, 220 | tokenizer=tokenizer, 221 | text_model_inp_shape=(max_words,), 222 | image_inp_shape=(image_shape, image_shape, 3), 223 | y_sub_labels=doc_type_y_labels, 224 | ) 225 | mlflow.tensorflow.autolog(every_n_iter=1) 226 | with mlflow.start_run(experiment_id=experiment.experiment_id): 227 | mlflow.log_metrics( 228 | { 229 | "batch_size": batch_size, 230 | "epochs": epochs, 231 | "image_shape": image_shape, 232 | "max_words": max_words, 233 | } 234 | ) 235 | model.fit( 236 | x=train_gen, steps_per_epoch=num_train_img // batch_size, epochs=epochs 237 | ) 238 | model.save( 239 | filepath=os.path.join( 240 | save_dir_path, artifact_name, "document_classifier.h5" 241 | ) 242 | ) 243 | meta_data_path = os.path.join(save_dir_path, artifact_name) 244 | for artifact in sorted(os.listdir(meta_data_path)): 245 | if artifact != ".DS_Store": 246 | artifact_path = os.path.join(meta_data_path, artifact) 247 | if ( 248 | os.path.isfile(artifact_path) 249 | and artifact_path.split(".")[-1] != "h5" 250 | ): 251 | print(f"artifact to be uploaded is: {artifact}") 252 | mlflow.log_artifact(local_path=artifact_path) 253 | 254 | artifact_uri = mlflow.get_artifact_uri() 255 | print(artifact_uri) 
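        # Note (illustrative): with the S3 artifact store configured above, this
        # URI typically has the shape <s3_bucket>/<experiment_id>/<run_id>/artifacts.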
256 | mlflow.end_run() 257 | -------------------------------------------------------------------------------- /2_level_doc_classification/pre_process_text.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import re 3 | from pdf2image import convert_from_path 4 | import os 5 | from tqdm import tqdm 6 | import pre_processing 7 | from PIL import Image 8 | import pytesseract 9 | 10 | nlp = spacy.load("en_core_web_sm") 11 | 12 | 13 | def helper(text): 14 | dummy = [] 15 | for word in text: 16 | dummy.append(str(word)) 17 | final = " ".join(dummy) 18 | return final 19 | 20 | 21 | def preprocess_spacy(text, num_of_words: int): 22 | text = str(text) 23 | text = text.split(" ") 24 | text = helper(text) 25 | text = str(text.lower()) 26 | # Remove all the special characters 27 | text = re.sub(r"\W", " ", text) 28 | text = re.sub(r"[^a-zA-Z ]+", "", text) 29 | # remove all single characters 30 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 31 | # Remove single characters from the start 32 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 33 | # Substituting multiple spaces with single space 34 | text = re.sub(r"\s+", " ", text, flags=re.I) 35 | text = nlp(text) 36 | filtered = [token.lemma_ for token in text if token.is_stop == False] 37 | text = " ".join(filtered[: num_of_words * 2]) 38 | text = text.strip().split(" ") 39 | text = " ".join(text[:num_of_words]) 40 | return text 41 | 42 | 43 | def read_text_from_pages_v2( 44 | complete_folder_path: str, 45 | path_to_save_essential_data: str, 46 | meta_name: str, 47 | num_of_words: int, 48 | ): 49 | final_path_for_data = os.path.join( 50 | path_to_save_essential_data, meta_name, f"file_and_text_{meta_name}.txt" 51 | ) 52 | if os.path.isfile(final_path_for_data): 53 | data = open(final_path_for_data, "r").read() 54 | else: 55 | data = "null" 56 | print("#### Reading pages ####") 57 | document_folders_path = os.path.join(complete_folder_path) 58 | master_doc_types = sorted(os.listdir(document_folders_path)) 59 | text_of_all_pages = [] 60 | for master_doc_type in master_doc_types: 61 | if master_doc_type != ".DS_Store": 62 | print("MASTER DOCUMENT TYPE: ", master_doc_type) 63 | sub_doc_type_path = os.path.join(document_folders_path, master_doc_type) 64 | for doc_image_type in sorted(os.listdir(sub_doc_type_path)): 65 | if doc_image_type != ".DS_Store": 66 | print("DOCUMENT TYPE: ", doc_image_type) 67 | complete_doc_image_path = os.path.join( 68 | sub_doc_type_path, doc_image_type 69 | ) 70 | pages = sorted(os.listdir(complete_doc_image_path)) 71 | for page in tqdm(pages): 72 | if page != ".DS_Store": 73 | page_path = os.path.join(complete_doc_image_path, page) 74 | if page_path not in data: 75 | document_page = Image.open(page_path) 76 | document_text = pytesseract.image_to_string( 77 | document_page 78 | ) 79 | document_page.close() 80 | essential_file_path_and_text = ( 81 | page_path 82 | + "####" 83 | + preprocess_spacy( 84 | document_text, num_of_words=num_of_words 85 | ) 86 | + "\n" 87 | ) 88 | text_of_all_pages.append(essential_file_path_and_text) 89 | 90 | if os.path.isfile(final_path_for_data): 91 | all_essential_data = open(final_path_for_data, "a+") 92 | all_essential_data.writelines(text_of_all_pages) 93 | else: 94 | all_essential_data = open(final_path_for_data, "w") 95 | all_essential_data.writelines(text_of_all_pages) 96 | return final_path_for_data 97 | 98 | 99 | def pdf_to_images(full_path_pdf: str, converted_images_path: str, meta_name: str): 100 | doc = full_path_pdf.split("/")[-1] 101 | 
index = 0 102 | OUTPUT_PATH = converted_images_path 103 | os.makedirs(name=OUTPUT_PATH, exist_ok=True) 104 | 105 | print("Document name: ", doc) 106 | if str(doc.split(".pdf")[-2]) + "_" + str(index) + ".jpg" not in os.listdir( 107 | converted_images_path 108 | ): 109 | pil_images = convert_from_path(full_path_pdf, dpi=300) 110 | 111 | for image in tqdm(pil_images): 112 | processed_image = pre_processing.preprocess_image_file(image) 113 | try: 114 | processed_image = Image.fromarray(processed_image) 115 | processed_image.save( 116 | os.path.join(OUTPUT_PATH, str(doc.split(".pdf")[-2])) 117 | + "_" 118 | + str(index) 119 | + ".jpg", 120 | format="JPEG", 121 | subsampling=0, 122 | quality=100, 123 | ) 124 | index += 1 125 | processed_image.close() 126 | except: 127 | index += 1 128 | else: 129 | pass -------------------------------------------------------------------------------- /2_level_doc_classification/pre_processing.py: -------------------------------------------------------------------------------- 1 | """IMAGE PREPROCESSING FUNCTIONS 2 | """ 3 | import cv2 4 | import numpy as np 5 | from scipy.ndimage.filters import rank_filter 6 | 7 | # from sbox.utils.sbox_logger import logger 8 | import pytesseract 9 | import re 10 | import imutils 11 | from PIL import Image 12 | 13 | # print("error") # = logger(__name__) 14 | 15 | 16 | class PagePreprocess(object): 17 | def __init__(self, im): 18 | self.err = False 19 | self.orig_im = im 20 | self.orig_shape = self.orig_im.shape 21 | self.image = im 22 | 23 | def crop(self): 24 | try: 25 | self.image, self.num_tries = process_image(self.orig_im) 26 | self.crop_shape = self.image.shape 27 | return self.image 28 | except Exception as e: 29 | print("crop_obj_Error") # (f"Error: {e}", exc_info=True) 30 | 31 | def deskew(self): 32 | try: 33 | self.image, self.theta_est = process_skewed_crop(self.image) 34 | return self.image 35 | except Exception as e: 36 | print("deskew_obj_Error") # (f"Error: {e}", exc_info=True) 37 | 38 | 39 | def auto_canny(image, sigma=0.33): 40 | try: 41 | v = np.median(image) 42 | lower = int(max(0, (1.0 - sigma) * v)) 43 | upper = int(min(255, (1.0 + sigma) * v)) 44 | edged = cv2.Canny(image, lower, upper, True) 45 | return edged 46 | except Exception as e: 47 | print("auto_canny_Error") # (f"Error: {e}", exc_info=True) 48 | 49 | 50 | def dilate(image, kernel, iterations): 51 | dilated_image = cv2.dilate(image, kernel, iterations=iterations) 52 | return dilated_image 53 | 54 | 55 | def downscale_image(im, max_dim=2048): 56 | try: 57 | a, b = im.shape[:2] 58 | if max(a, b) <= max_dim: 59 | return 1.0, im 60 | 61 | scale = 1.0 * max_dim / max(a, b) 62 | new_im = cv2.resize(im, (int(b * scale), int(a * scale)), cv2.INTER_AREA) 63 | return scale, new_im 64 | except Exception as e: 65 | print("error") # (f"Error: {e}", exc_info=True) 66 | 67 | 68 | def find_components(im, max_components=16): 69 | try: 70 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) 71 | dilation = dilate(im, kernel, 6) 72 | 73 | count = 21 74 | n = 0 75 | sigma = 0.000 76 | 77 | while count > max_components: 78 | n += 1 79 | sigma += 0.005 80 | result = cv2.findContours(dilation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 81 | if len(result) == 3: 82 | _, contours, hierarchy = result 83 | elif len(result) == 2: 84 | contours, hierarchy = result 85 | possible = find_likely_rectangles(contours, sigma) 86 | count = len(possible) 87 | 88 | return (dilation, possible, n) 89 | except Exception as e: 90 | print("comp_error") # (f"Error: {e}", exc_info=True) 
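    # Note: the while-loop above keeps raising the polygon-approximation
    # tolerance (sigma) and re-running contour detection until at most
    # max_components candidate text-block rectangles remain.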
91 | 92 | 93 | def find_likely_rectangles(contours, sigma): 94 | try: 95 | contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] 96 | possible = [] 97 | for c in contours: 98 | 99 | peri = cv2.arcLength(c, True) 100 | approx = cv2.approxPolyDP(c, sigma * peri, True) 101 | box = make_box(approx) 102 | possible.append(box) 103 | 104 | return possible 105 | except Exception as e: 106 | print("likely_rec_error") # (f"Error: {e}", exc_info=True) 107 | 108 | 109 | def make_box(poly): 110 | try: 111 | x = [] 112 | y = [] 113 | for p in poly: 114 | for point in p: 115 | x.append(point[0]) 116 | y.append(point[1]) 117 | xmax = max(x) 118 | ymax = max(y) 119 | xmin = min(x) 120 | ymin = min(y) 121 | return (xmin, ymin, xmax, ymax) 122 | except Exception as e: 123 | print("bbox_error") # (f"Error: {e}", exc_info=True) 124 | 125 | 126 | def rect_union(crop1, crop2): 127 | x11, y11, x21, y21 = crop1 128 | x12, y12, x22, y22 = crop2 129 | return min(x11, x12), min(y11, y12), max(x21, x22), max(y21, y22) 130 | 131 | 132 | def rect_area(crop): 133 | x1, y1, x2, y2 = crop 134 | return max(0, x2 - x1) * max(0, y2 - y1) 135 | 136 | 137 | def crop_image(im, rect, scale): 138 | try: 139 | xmin, ymin, xmax, ymax = rect 140 | crop = [xmin, ymin, xmax, ymax] 141 | xmin, ymin, xmax, ymax = [int(x / scale) for x in crop] 142 | if ((ymax - ymin) * (xmax - xmin)) > 0.25 * im.size: 143 | cropped = im[ymin:ymax, xmin:xmax] 144 | else: 145 | cropped = im 146 | return cropped 147 | except Exception as e: 148 | print("crop_error_1") # (f"Error: {e}", exc_info=True) 149 | 150 | 151 | def reduce_noise_raw(im): 152 | bilat = cv2.bilateralFilter(im, 4, 75, 75) 153 | blur = cv2.medianBlur(bilat, 1) 154 | return blur 155 | 156 | 157 | def reduce_noise_edges(im): 158 | try: 159 | structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) 160 | opening = cv2.morphologyEx(im, cv2.MORPH_OPEN, structuring_element) 161 | maxed_rows = rank_filter(opening, -4, size=(1, 20)) 162 | maxed_cols = rank_filter(opening, -4, size=(20, 1)) 163 | debordered = np.minimum(np.minimum(opening, maxed_rows), maxed_cols) 164 | return debordered 165 | except Exception as e: 166 | print("noise_red_Error") # (f"Error: {e}", exc_info=True) 167 | 168 | 169 | def rects_are_vertical(rect1, rect2, rect_align=2): 170 | try: 171 | xmin1, ymin1, xmax1, ymax1 = rect1 172 | xmin2, ymin2, xmax2, ymax2 = rect2 173 | 174 | midpoint1 = (xmin1 + xmax1) / 2 175 | midpoint2 = (xmin2 + xmax2) / 2 176 | dist = abs(midpoint1 - midpoint2) 177 | 178 | rectarea1 = rect_area(rect1) 179 | rectarea2 = rect_area(rect2) 180 | if rectarea1 > rectarea2: 181 | thres = (xmax1 - xmin1) * rect_align 182 | else: 183 | thres = (xmax2 - xmin2) * rect_align 184 | 185 | if thres > dist: 186 | align = True 187 | else: 188 | align = False 189 | return align 190 | except Exception as e: 191 | print("vert_rec_Error") # (f"Error: {e}", exc_info=True) 192 | 193 | 194 | def find_final_crop(im, rects, orig_im): 195 | try: 196 | current = None 197 | for rect in rects: 198 | if current is None: 199 | current = rect 200 | continue 201 | 202 | aligned = rects_are_vertical(current, rect) 203 | 204 | if not aligned: 205 | continue 206 | 207 | current = rect_union(current, rect) 208 | if current is not None: 209 | return current 210 | else: 211 | return (0, 0, orig_im.shape[0], orig_im.shape[1]) 212 | except Exception as e: 213 | print("crop_Error") # (f"Error: {e}", exc_info=True) 214 | 215 | 216 | def process_image(orig_im): 217 | try: 218 | scale, im = downscale_image(orig_im) 
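        # The steps below all run on the downscaled copy: denoise, auto-Canny
        # edges, border suppression, then component search; crop_image maps the
        # chosen rectangle back onto orig_im using `scale`.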
219 |
220 |         blur = reduce_noise_raw(im.copy())
221 |
222 |         edges = auto_canny(blur.copy())
223 |
224 |         debordered = reduce_noise_edges(edges.copy())
225 |
226 |         dilation, rects, num_tries = find_components(debordered, 16)
227 |
228 |         final_rect = find_final_crop(dilation, rects, orig_im)
229 |
230 |         cropped = crop_image(orig_im, final_rect, scale)
231 |         # kernel = np.ones((3, 3), np.float32) / 25
232 |         # smooth2d = cv2.filter2D(cropped, -1, kernel=kernel)
233 |         return (cropped, num_tries)
234 |     except Exception as e:
235 |         print("process") # (f"Error: {e}", exc_info=True)
236 |
237 |
238 | def rad_to_deg(theta):
239 |     return theta * 180 / np.pi
240 |
241 |
242 | def rotate(image, theta):
243 |     try:
244 |         (h, w) = image.shape[:2]
245 |         center = (w / 2, h / 2)
246 |         M = cv2.getRotationMatrix2D(center, theta, 1)
247 |         rotated = cv2.warpAffine(
248 |             image,
249 |             M,
250 |             (int(w), int(h)),
251 |             cv2.INTER_LINEAR,
252 |             borderMode=cv2.BORDER_CONSTANT,
253 |             borderValue=(255, 255, 255),
254 |         )
255 |         return rotated
256 |     except Exception as e:
257 |         print("rotation_error") # (f"Error: {e}", exc_info=True)
258 |
259 |
260 | def angle_calculation(gray):
261 |     gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY)
262 |     gray = cv2.bitwise_not(gray)
263 |     thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
264 |
265 |     coords = np.column_stack(np.where(thresh > 0))
266 |     # print(coords, coords.shape)
267 |
268 |     min_y = coords[0][0]
269 |     max_y = coords[-1][0]
270 |     min_x = coords[0][1] # rows of coords are (y, x), so x sits in column 1
271 |     max_x = coords[-1][1]
272 |
273 |     left_most = coords[0]
274 |     right_most = coords[0]
275 |     top_most = coords[0]
276 |     bottom_most = coords[0]
277 |     # print(coords[0], coords[-1])
278 |     for i in range(1, coords.shape[0]):
279 |         y, x = coords[i][0], coords[i][1]
280 |         if y <= min_y:
281 |             min_y = y
282 |             top_most = coords[i]
283 |         elif y >= max_y:
284 |             max_y = y
285 |             bottom_most = coords[i]
286 |         if x <= min_x:
287 |             min_x = x
288 |             left_most = coords[i]
289 |         elif x >= max_x:
290 |             max_x = x
291 |             right_most = coords[i]
292 |     # print(top_most, left_most, bottom_most, right_most)
293 |
294 |     slopes = []
295 |     edge_coor = [top_most, left_most, bottom_most, right_most]
296 |     for i in range(0, len(edge_coor)):
297 |         if i == len(edge_coor) - 1:
298 |             if abs((edge_coor[0][1] - edge_coor[i][1])) >= 10:
299 |                 angle = (
300 |                     (
301 |                         (edge_coor[0][0] - edge_coor[i][0])
302 |                         / (edge_coor[0][1] - edge_coor[i][1])
303 |                     )
304 |                     * 180
305 |                 ) / 3.14
306 |                 slopes.append(angle)
307 |             else:
308 |                 slopes.append(0.0)
309 |         else:
310 |             if abs((edge_coor[i + 1][1] - edge_coor[i][1])) >= 10:
311 |                 angle = (
312 |                     (
313 |                         (edge_coor[i + 1][0] - edge_coor[i][0])
314 |                         / (edge_coor[i + 1][1] - edge_coor[i][1])
315 |                     )
316 |                     * 180
317 |                 ) / 3.14
318 |                 slopes.append(angle)
319 |             else:
320 |                 slopes.append(0.0)
321 |         # img = cv2.circle(thresh, (edge_coor[i][1], edge_coor[i][0]), 5, (255, 0, 0), 2)
322 |
323 |     slopes = np.asarray(slopes)
324 |     if len(np.where(slopes == 0.0)[0]) >= 2:
325 |         # two or more flat edges: don't rotate
326 |         return None
327 |     else:
328 |         # rotate
329 |         neg_slope = (slopes[0] + slopes[2]) / 2
330 |         pos_slope = (slopes[1] + slopes[3]) / 2
331 |         # print(pos_slope, neg_slope)
332 |         new_pos_slope = pos_slope
333 |         new_neg_slope = neg_slope
334 |         if pos_slope > 90:
335 |             if pos_slope < 180:
336 |                 new_pos_slope = 180 - pos_slope
337 |             else:
338 |                 new_pos_slope = pos_slope - ((pos_slope // 180) * 180)
339 |         # print(new_pos_slope)
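        # Fold both averaged slopes back into [-90, 90] so the smaller
        # absolute rotation can be chosen below.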
340 |         if neg_slope < -90:
341 |             new_neg_slope = 180 + neg_slope
342 |         # print(new_pos_slope, new_neg_slope)
343 |         if new_pos_slope <= new_neg_slope:
344 |             fin_angle = pos_slope
345 |         else:
346 |             fin_angle = neg_slope
347 |
348 |         if fin_angle < -90:
349 |             rot_angle = 180 + fin_angle
350 |         elif fin_angle > 90:
351 |             rot_angle = -(180 - fin_angle)
352 |         else:
353 |             # covers -90 <= fin_angle <= 90 (0 and the axis angles included),
354 |             # so every branch assigns rot_angle
355 |             rot_angle = fin_angle
356 |         return rot_angle
357 |
358 |
359 | def estimate_skew(image):
360 |     try:
361 |         osd = pytesseract.image_to_osd(image)
362 |         angle = float(re.search(r"(?<=Rotate: )\d+", osd).group(0))
363 |         if angle == 0:
364 |             # fin_image = rotate(image_gray, angle)
365 |             edges = auto_canny(image)
366 |             # print(edges.shape)
367 |             # print("edges found: ", edges)
368 |             lines = cv2.HoughLines(edges, 1, np.pi / 270, 400)
369 |             # print("lines found: ", lines)
370 |             if lines is not None:
371 |                 new = edges.copy()
372 |                 thetas = []
373 |                 for line in lines:
374 |                     for rho, theta in line:
375 |                         a = np.cos(theta)
376 |                         b = np.sin(theta)
377 |                         x0 = a * rho
378 |                         y0 = b * rho
379 |                         x1 = int(x0 + 1000 * (-b))
380 |                         y1 = int(y0 + 1000 * (a))
381 |                         x2 = int(x0 - 1000 * (-b))
382 |                         y2 = int(y0 - 1000 * (a))
383 |                         if theta > np.pi / 3 and theta < np.pi * 2 / 3:
384 |                             thetas.append(theta)
385 |                         new = cv2.line(new, (x1, y1), (x2, y2), (255, 255, 255), 1)
386 |
387 |                 theta_mean = np.mean(thetas)
388 |                 theta = -(90 - (rad_to_deg(theta_mean) if len(thetas) > 0 else 0))
389 |             else:
390 |                 # theta = angle_calculation(image)
391 |                 theta = 0.0
392 |         else:
393 |             theta = angle
394 |         return theta
395 |     except Exception as e:
396 |         print("theta_error") # (f"Error: {e}", exc_info=True)
397 |
398 |
399 | def process_skewed_crop(image):
400 |     try:
401 |         theta = estimate_skew(image)
402 |         # print(theta)
403 |         # ret, thresh = cv2.threshold(image, 0, 127, cv2.THRESH_OTSU)
404 |         # print(thresh)
405 |         if theta is None:
406 |             rotated = image
407 |         elif (theta % 90) != 0:
408 |             rotated = rotate(image, theta)
409 |         else:
410 |             rotated = imutils.rotate_bound(image, theta)
411 |         # print(rotated)
412 |         return rotated, theta
413 |     except Exception as e:
414 |         print("skew_Error") # (f"Error: {e}", exc_info=True)
415 |
416 |
417 | def preprocess_image(file_path: str):
418 |     try:
419 |         gray_page = cv2.imread(file_path, 0)
420 |         process_page = PagePreprocess(gray_page)
421 |         _ = process_page.crop()
422 |         deskewed_page = process_page.deskew()
423 |         # cv2.imwrite(file_path, deskewed_page)
424 |         return deskewed_page
425 |     except Exception as e:
426 |         print("process_image_error") # (f"Error: {e}", exc_info=True)
427 |
428 |
429 | def preprocess_image_file(img):
430 |     try:
431 |         # converted_image = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR)
432 |         gray_page = cv2.cvtColor(np.array(img), cv2.COLOR_BGR2GRAY)
433 |         # gray_page = cv2.cvtColor(gray_page, cv2.COLOR_BGR2RGB)
434 |         process_page = PagePreprocess(gray_page)
435 |         _ = process_page.crop()
436 |         deskewed_page = process_page.deskew()
437 |         return deskewed_page
438 |     except Exception as e:
439 |         print("error") # (f"Error: {e}", exc_info=True)
440 |
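A quick usage sketch for this module, assuming it is importable as pre_processing and that OpenCV, SciPy, imutils, and pytesseract are installed; the file names below are illustrative:

import cv2
from pre_processing import preprocess_image

# Crop and deskew a scanned page before OCR. The pipeline swallows its own
# exceptions and prints a tag instead, so a None return is possible.
deskewed_page = preprocess_image("sample_page.jpg")
if deskewed_page is not None:
    cv2.imwrite("sample_page_clean.jpg", deskewed_page)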
--------------------------------------------------------------------------------
/2_level_doc_classification/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from hybrid_v2 import train_hybrid_v2
4 | from pre_process_text import (
5 |     pdf_to_images,
6 |     read_text_from_pages_v2,
7 | )
8 |
9 |
10 | def process_multi_level(
11 |     dataset_path: str,
12 |     save_dir: str,
13 |     pdf_check: bool,
14 |     artifact_name: str,
15 |     num_words_to_read: int,
16 | ):
17 |     updated_dataset_path = os.path.join(save_dir, artifact_name, "dataset")
18 |     os.makedirs(updated_dataset_path, exist_ok=True)
19 |     for master_document_type in sorted(os.listdir(dataset_path)):
20 |         if master_document_type != ".DS_Store":
21 |             document_folder_path = os.path.join(dataset_path, master_document_type)
22 |             updated_master_document_type_folder_path = os.path.join(
23 |                 updated_dataset_path, master_document_type
24 |             )
25 |             os.makedirs(updated_master_document_type_folder_path, exist_ok=True)
26 |             for document_type in sorted(os.listdir(document_folder_path)):
27 |                 if document_type != ".DS_Store":
28 |                     folder_path = os.path.join(document_folder_path, document_type)
29 |                     updated_document_type_folder_path = os.path.join(
30 |                         updated_master_document_type_folder_path, document_type
31 |                     )
32 |                     os.makedirs(updated_document_type_folder_path, exist_ok=True)
33 |                     for documents in sorted(os.listdir(folder_path)):
34 |                         if documents != ".DS_Store":
35 |                             document_path = os.path.join(folder_path, documents)
36 |                             if pdf_check:
37 |                                 # Perform conversion and store the images in a temp folder
38 |                                 pdf_to_images(
39 |                                     full_path_pdf=document_path,
40 |                                     converted_images_path=updated_document_type_folder_path,
41 |                                     meta_name=artifact_name,
42 |                                 )
43 |     if pdf_check:
44 |         images_data_path = os.path.join(save_dir, artifact_name)
45 |     else:
46 |         images_data_path = dataset_path
47 |     master_data_path = read_text_from_pages_v2(
48 |         complete_folder_path=images_data_path,
49 |         path_to_save_essential_data=save_dir,
50 |         meta_name=artifact_name,
51 |         num_of_words=num_words_to_read,
52 |     )
53 |     return master_data_path
54 |
55 |
56 | def multi_level(args):
57 |     all_data_path = process_multi_level(
58 |         dataset_path=args.data_path,
59 |         save_dir=args.file_path,
60 |         pdf_check=bool(args.pdfs),
61 |         artifact_name=args.art_name,
62 |         num_words_to_read=args.num_of_words,
63 |     )
64 |     train_hybrid_v2(
65 |         text_plus_file_path=all_data_path,
66 |         batch_size=int(args.batch_size),
67 |         epochs=int(args.epochs),
68 |         image_shape=int(args.img_shape),
69 |         max_words=int(args.num_of_words),
70 |         artifact_name=args.art_name,
71 |         save_dir_path=args.file_path,
72 |         trained_model_path=args.model_path,
73 |         experiment_name=args.experiment_name,
74 |     )
75 |
76 |
77 | parser = argparse.ArgumentParser()
78 | parser.add_argument("-dp", "--data_path", help="File path of the dataset")
79 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts")
80 | parser.add_argument("-a", "--art_name", help="Artifacts name")
81 | parser.add_argument("-p", "--pdfs", default=False, help="Dataset type")
82 | parser.add_argument("-n", "--num_of_words", default=10, help="No of words to read")
83 | parser.add_argument("-b", "--batch_size", default=8, help="Batch size for training")
84 | parser.add_argument("-e", "--epochs", default=3, help="Number of epochs")
85 | parser.add_argument("-is", "--img_shape", default=100, help="One dimension of image")
86 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model")
87 | parser.add_argument(
88 |     "-exp",
89 |     "--experiment_name",
90 |     default="multi_label_classification",
91 |     help="Name of the experiment for tracking",
92 | )
93 | args = parser.parse_args()
94 | multi_level(args=args)
95 |
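For reference, a typical invocation of this script could look like the line below; all paths and names are illustrative. Note that --pdfs is cast with bool(), so any non-empty string (even "False") enables PDF conversion; omit the flag for image datasets.

python train.py -dp /data/raw_docs -fp /data/artifacts -a som -p True -n 10 -b 8 -e 3 -is 100 -exp multi_label_classification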
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Vaibhav Satpathy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # psAI-clOps
2 | End to End MLOps
3 |
4 | This is a workflow to ensure Model Ops is taken care of properly in any organisation
5 |
6 | To utilize this repository to its fullest and set up the required dashboard on AWS for tracking, deploying and versioning, check out the following blogs:
7 | 1. Setup MLflow on AWS: https://vaibhavsatpathy.medium.com/setup-mlflow-on-aws-ec2-94b8e473618f
8 | 2. MLOps deployment into AWS Fargate I: https://vaibhavsatpathy.medium.com/mlops-deployment-into-aws-fargate-i-bd612af5dd7a
9 | 3. 
MLOps deployment into AWS Fargate II: https://vaibhavsatpathy.medium.com/mlops-deployment-in-to-aws-fargate-ii-95321942b9e1 10 | -------------------------------------------------------------------------------- /conditional_GAN/__pycache__/bento_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/conditional_GAN/__pycache__/bento_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /conditional_GAN/artifacts/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/conditional_GAN/artifacts/saved_model.pb -------------------------------------------------------------------------------- /conditional_GAN/artifacts/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/conditional_GAN/artifacts/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /conditional_GAN/artifacts/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/conditional_GAN/artifacts/variables/variables.index -------------------------------------------------------------------------------- /conditional_GAN/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import ConditionalDigitGenerator 2 | from tensorflow.keras.models import load_model 3 | import tensorflow as tf 4 | 5 | 6 | def classifier_models(model_service, model_path: str): 7 | model_gen = load_model(model_path) 8 | tf.saved_model.save(model_gen, "artifacts/") 9 | model_gen = tf.saved_model.load("artifacts/") 10 | model_service.pack("model", model_gen) 11 | 12 | 13 | def main(): 14 | model_service = ConditionalDigitGenerator() 15 | classifier_models(model_service=model_service, model_path=generator_model_path) 16 | saved_path = model_service.save() 17 | 18 | 19 | generator_model_path = "/Users/vsatpathy/Desktop/docs/training_data/c_gan/generator.h5" 20 | main() -------------------------------------------------------------------------------- /conditional_GAN/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.adapters import JsonInput 3 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 4 | 5 | import tensorflow as tf 6 | import importlib.util 7 | import numpy as np 8 | from PIL import Image 9 | 10 | 11 | @bentoml.env(infer_pip_packages=True) 12 | @bentoml.artifacts([TensorflowSavedModelArtifact("model")]) 13 | class ConditionalDigitGenerator(bentoml.BentoService): 14 | @bentoml.api(input=JsonInput()) 15 | def generate_conditional_image(self, parsed_json): 16 | model = self.artifacts.model.signatures["serving_default"] 17 | model._num_positional_args = 2 18 | noise = np.random.normal(0, 1, (1, 100)) 19 | noise = tf.convert_to_tensor(noise, dtype=tf.float32) 20 | label = np.asarray(int(parsed_json.get("number"))).reshape(-1, 1) 21 | label = 
tf.convert_to_tensor(label, dtype=tf.int32) 22 | results = model(noise, label) 23 | generated_image = results.get("sequential")[0].numpy().reshape(28, 28) 24 | return {"digit_generated": generated_image} 25 | -------------------------------------------------------------------------------- /conditional_GAN/infer.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import load_model 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from PIL import Image 5 | 6 | 7 | def test(gen_model_path: str, i: int): 8 | gen = load_model(gen_model_path) 9 | noise = np.random.normal(0, 1, (1, 100)) 10 | label = np.random.randint(0, 10, 1).reshape(-1, 1) 11 | image = np.squeeze(gen.predict([noise, label]), axis=0) 12 | plt.imsave( 13 | "/Users/vsatpathy/Desktop/off_POCs/cycle_gan/epoch_%d_tag_%s" % (i, label[0]), 14 | image.reshape(28, 28), 15 | format="jpg", 16 | cmap="gray", 17 | ) 18 | 19 | 20 | generator_model_path = "/Users/vsatpathy/Desktop/docs/training_data/c_gan/generator.h5" 21 | test(gen_model_path=generator_model_path, i=0) 22 | -------------------------------------------------------------------------------- /conditional_GAN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import argparse 5 | import mlflow 6 | 7 | from tensorflow.keras.models import Model, Sequential 8 | from tensorflow.keras.datasets import mnist 9 | from tensorflow.keras.optimizers import Adam 10 | from tensorflow.keras import backend as K 11 | from tensorflow.keras import initializers 12 | from tensorflow.keras.layers import ( 13 | Input, 14 | multiply, 15 | Embedding, 16 | LeakyReLU, 17 | Reshape, 18 | Dense, 19 | Dropout, 20 | Flatten, 21 | Convolution2D, 22 | UpSampling2D, 23 | BatchNormalization, 24 | ) 25 | 26 | 27 | tracking_uri = ( 28 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 29 | ) 30 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 31 | 32 | 33 | def generator(): 34 | gen = Sequential() 35 | gen.add(Dense(256, input_dim=100)) 36 | gen.add(LeakyReLU(0.2)) 37 | gen.add(BatchNormalization(momentum=0.8)) 38 | gen.add(Dense(512)) 39 | gen.add(LeakyReLU(0.2)) 40 | gen.add(BatchNormalization(momentum=0.8)) 41 | gen.add(Dense(1024)) 42 | gen.add(LeakyReLU(0.2)) 43 | gen.add(BatchNormalization(momentum=0.8)) 44 | gen.add(Dense(784, activation="tanh")) 45 | # gen.summary() 46 | 47 | noise = Input(shape=(100,)) 48 | label = Input(shape=(1,), dtype="int32") 49 | label_embedding = Flatten()(Embedding(10, 100)(label)) 50 | model_input = multiply([noise, label_embedding]) 51 | image = gen(model_input) 52 | 53 | gen = Model([noise, label], image) 54 | gen.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 55 | return gen 56 | 57 | 58 | def discriminator(): 59 | disc = Sequential() 60 | disc.add(Dense(512, input_dim=784)) 61 | disc.add(LeakyReLU(0.2)) 62 | disc.add(Dropout(0.4)) 63 | disc.add(Dense(512)) 64 | disc.add(LeakyReLU(0.2)) 65 | disc.add(Dropout(0.4)) 66 | disc.add(Dense(512)) 67 | disc.add(LeakyReLU(0.2)) 68 | disc.add(Dropout(0.4)) 69 | disc.add(Dense(1, activation="sigmoid")) 70 | # disc.summary() 71 | 72 | image = Input(shape=(784,)) 73 | label = Input(shape=(1,), dtype="int32") 74 | label_embedding = Flatten()(Embedding(10, 784)(label)) 75 | model_input = multiply([image, label_embedding]) 76 | prediction = disc(model_input) 77 | 78 | disc = 
Model([image, label], prediction) 79 | disc.compile( 80 | loss="binary_crossentropy", 81 | optimizer=Adam(lr=0.0002, beta_1=0.5), 82 | metrics=["accuracy"], 83 | ) 84 | return disc 85 | 86 | 87 | def stacked_GAN(gen, disc): 88 | gan_input = Input(shape=(100,)) 89 | label = Input(shape=(1,)) 90 | x = gen([gan_input, label]) 91 | disc.trainable = False 92 | gan_out = disc([x, label]) 93 | gan_stack = Model([gan_input, label], gan_out) 94 | gan_stack.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 95 | return gan_stack 96 | 97 | 98 | def train( 99 | gen, 100 | disc, 101 | gan_stack, 102 | max_iter: int, 103 | batch_size: int, 104 | img_shape: int, 105 | file_path: str, 106 | artifact_name: str, 107 | exp_name: str, 108 | ): 109 | 110 | mlflow.set_tracking_uri(tracking_uri) 111 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 112 | try: 113 | expr_name = exp_name # create a new experiment (do not replace) 114 | mlflow.create_experiment(expr_name, s3_bucket) 115 | mlflow.set_experiment(expr_name) 116 | experiment = mlflow.get_experiment_by_name(exp_name) 117 | except: 118 | experiment = mlflow.get_experiment_by_name(exp_name) 119 | 120 | os.makedirs(os.path.join(file_path, artifact_name), exist_ok=True) 121 | mlflow.tensorflow.autolog(every_n_iter=1) 122 | with mlflow.start_run(experiment_id=experiment.experiment_id): 123 | 124 | mlflow.log_metrics( 125 | { 126 | "batch_size": batch_size, 127 | "epochs": max_iter, 128 | "image_shape": img_shape, 129 | } 130 | ) 131 | 132 | (X_train, y_train), (_, _) = mnist.load_data() 133 | X_train = (X_train.astype(np.float32) - 127.5) / 127.5 134 | X_train = X_train.reshape(60000, 784) 135 | y_train = y_train.reshape(-1, 1) 136 | 137 | valid = np.ones((batch_size, 1)) 138 | fake = np.zeros((batch_size, 1)) 139 | for i in range(max_iter): 140 | noise = np.random.normal(0, 1, (batch_size, 100)) 141 | index = np.random.randint(0, X_train.shape[0], size=batch_size) 142 | image_batch = X_train[index] 143 | label_batch = y_train[index] 144 | 145 | fake_images = gen.predict([noise, label_batch]) 146 | 147 | disc.trainable = True 148 | disc_loss_real = disc.train_on_batch([image_batch, label_batch], valid) 149 | disc_loss_fake = disc.train_on_batch([fake_images, label_batch], fake) 150 | disc_loss_final = 0.5 * np.add(disc_loss_real, disc_loss_fake) 151 | 152 | fake_labels = np.random.randint(0, 10, batch_size).reshape(-1, 1) 153 | disc.trainable = False 154 | gen_loss = gan_stack.train_on_batch([noise, fake_labels], valid) 155 | 156 | mlflow.log_metrics( 157 | {"generator_loss": gen_loss, "discriminator_loss": disc_loss_final[0]} 158 | ) 159 | 160 | print( 161 | "epoch_%d---->gen_loss:[%f]---->disc_loss:[%f]---->acc:[%f]" 162 | % (i, gen_loss, disc_loss_final[0], disc_loss_final[1] * 100) 163 | ) 164 | # if i % 100 == 0: 165 | # test(gen, i) 166 | gen.save(os.path.join(file_path, artifact_name, "generator.h5")) 167 | # disc.save(os.path.join(file_path, artifact_name, "discriminator.h5")) 168 | 169 | meta_data_path = os.path.join(file_path, artifact_name) 170 | for artifact in sorted(os.listdir(meta_data_path)): 171 | if artifact != ".DS_Store": 172 | artifact_path = os.path.join(meta_data_path, artifact) 173 | if ( 174 | os.path.isfile(artifact_path) 175 | and artifact_path.split(".")[-1] != "h5" 176 | ): 177 | print(f"artifact to be uploaded is: {artifact}") 178 | mlflow.log_artifact(local_path=artifact_path) 179 | 180 | artifact_uri = mlflow.get_artifact_uri() 181 | print(artifact_uri) 182 | mlflow.end_run() 183 | 184 
|
185 | parser = argparse.ArgumentParser()
186 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts")
187 | parser.add_argument("-a", "--art_name", help="Artifacts name")
188 | parser.add_argument("-b", "--batch_size", default=32, help="Batch size for training")
189 | parser.add_argument("-e", "--epochs", default=20000, help="Number of epochs")
190 | parser.add_argument("-is", "--img_shape", default=784, help="One dimension of image")
191 | parser.add_argument(
192 |     "-exp",
193 |     "--experiment_name",
194 |     default="conditional_gan",
195 |     help="Name of the experiment for tracking",
196 | )
197 | args = parser.parse_args()
198 | # Build the networks once so the generator and discriminator trained through
199 | # the stacked GAN are the same instances that get saved afterwards.
200 | gen_model = generator()
201 | disc_model = discriminator()
202 | train(
203 |     gen=gen_model,
204 |     disc=disc_model,
205 |     gan_stack=stacked_GAN(gen=gen_model, disc=disc_model),
206 |     max_iter=int(args.epochs),
207 |     batch_size=int(args.batch_size),
208 |     img_shape=int(args.img_shape),
209 |     file_path=args.file_path,
210 |     artifact_name=args.art_name,
211 |     exp_name=args.experiment_name,
212 | )
213 |
--------------------------------------------------------------------------------
/document_classification/__pycache__/hybrid_v1.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/document_classification/__pycache__/hybrid_v1.cpython-38.pyc
--------------------------------------------------------------------------------
/document_classification/__pycache__/pre_process_text.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/document_classification/__pycache__/pre_process_text.cpython-38.pyc
--------------------------------------------------------------------------------
/document_classification/__pycache__/pre_processing.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/document_classification/__pycache__/pre_processing.cpython-38.pyc
--------------------------------------------------------------------------------
/document_classification/bento_package.py:
--------------------------------------------------------------------------------
1 | from bento_predictor import ModelZoo
2 | import tensorflow as tf
3 | from tensorflow.keras.models import load_model
4 | from tensorflow.keras.preprocessing.text import Tokenizer
5 | import spacy
6 | import json
7 |
8 |
9 | def mcr_models(
10 |     model_zoo, tf_model_path_mcr: str, text_file_path_mcr: str, labels_path_mcr: str
11 | ):
12 |     model_cnn = load_model(tf_model_path_mcr)
13 |     tf.saved_model.save(model_cnn, "artifacts/")
14 |     model_cnn = tf.saved_model.load("artifacts/")
15 |     model_zoo.pack("mcr_model", model_cnn)
16 |
17 |     text_model = spacy.load("en_core_web_sm")
18 |     model_zoo.pack("mcr_spacy_model", text_model)
19 |
20 |     tokenizer = Tokenizer()
21 |     with open(text_file_path_mcr, "r") as f:
22 |         bow = f.read()
23 |     tokenizer.fit_on_texts(bow.split("####"))
24 |     model_zoo.pack("mcr_tokenizer", tokenizer)
25 |
26 |     with open(labels_path_mcr, "r") as f:
27 |         labels_mcr = json.load(f)
28 |     model_zoo.pack("mcr_labels", labels_mcr)
29 |
30 |
31 | def main():
32 |     model_zoo = ModelZoo()
33 |     mcr_models(
34 |         model_zoo=model_zoo,
35 |         tf_model_path_mcr=tf_model_path_mcr,
36 |         text_file_path_mcr=text_file_path_mcr,
37 |
labels_path_mcr=labels_path_mcr, 38 | ) 39 | saved_path = model_zoo.save() 40 | 41 | 42 | tf_model_path_mcr = ( 43 | "/Users/vsatpathy/Desktop/docs/training_data/mcr/document_classifier.h5" 44 | ) 45 | text_file_path_mcr = ( 46 | "/Users/vsatpathy/Desktop/docs/training_data/mcr/file_and_text_mcr.txt" 47 | ) 48 | labels_path_mcr = "/Users/vsatpathy/Desktop/docs/training_data/mcr/rev_labels_mcr.json" 49 | main() 50 | -------------------------------------------------------------------------------- /document_classification/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.types import FileLike 3 | from bentoml.adapters import JsonInput, FileInput, MultiFileInput 4 | from bentoml.frameworks.spacy import SpacyModelArtifact 5 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 6 | from bentoml.service.artifacts.common import ( 7 | JSONArtifact, 8 | PickleArtifact, 9 | ) 10 | 11 | import tensorflow as tf 12 | from tensorflow.keras.preprocessing.image import load_img 13 | import numpy as np 14 | from pytesseract import image_to_string 15 | import re 16 | from PIL import Image 17 | from typing import List 18 | 19 | 20 | @bentoml.env(infer_pip_packages=True) 21 | @bentoml.artifacts( 22 | [ 23 | TensorflowSavedModelArtifact("mcr_model"), 24 | SpacyModelArtifact("mcr_spacy_model"), 25 | PickleArtifact("mcr_tokenizer"), 26 | JSONArtifact("mcr_labels"), 27 | ] 28 | ) 29 | class ModelZoo(bentoml.BentoService): 30 | def helper(self, text): 31 | dummy = [] 32 | for word in text: 33 | dummy.append(str(word)) 34 | final = " ".join(dummy) 35 | return final 36 | 37 | def preprocess_spacy(self, spacy_model, text, num_of_words: int): 38 | text = str(text) 39 | text = text.split(" ") 40 | text = self.helper(text) 41 | text = str(text.lower()) 42 | # Remove all the special characters 43 | text = re.sub(r"\W", " ", text) 44 | text = re.sub(r"[^a-zA-Z ]+", "", text) 45 | # remove all single characters 46 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 47 | # Remove single characters from the start 48 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 49 | # Substituting multiple spaces with single space 50 | text = re.sub(r"\s+", " ", text, flags=re.I) 51 | # text = self.artifacts.mcr_spacy_model(text) 52 | text = spacy_model(text) 53 | filtered = [token.lemma_ for token in text if token.is_stop == False] 54 | text = " ".join(filtered[: num_of_words * 2]) 55 | text = text.strip().split(" ") 56 | text = " ".join(text[:num_of_words]) 57 | return text 58 | 59 | def tokenize_sentence(self, sentence, tokenizer, maximum_word_length): 60 | updated_sentence = sentence.split(" ") 61 | tok_sent = [] 62 | for word in updated_sentence: 63 | if word in tokenizer.word_index: 64 | tok_sent.append(tokenizer.word_index[word]) 65 | else: 66 | tok_sent.append(0) 67 | if len(tok_sent) != maximum_word_length: 68 | delta = maximum_word_length - len(tok_sent) 69 | for i in range(delta): 70 | tok_sent.append(0) 71 | return tok_sent 72 | 73 | def pre_process_image(self, image_file): 74 | ocr_image = np.asarray(Image.open(image_file)) 75 | image = np.asarray( 76 | Image.open(image_file).convert(mode="RGB").resize((100, 100)) 77 | ) 78 | image = np.divide(image, 255.0) 79 | image = np.asarray([image]).astype("float32") 80 | return ocr_image, image 81 | 82 | def pre_process_mcr(self, file): 83 | ocr_image, image = self.pre_process_image(image_file=file) 84 | doc_text = image_to_string(ocr_image) 85 | doc_text_processed = self.preprocess_spacy( 86 
| spacy_model=self.artifacts.mcr_spacy_model, text=doc_text, num_of_words=10 87 | ) 88 | fin_text = self.tokenize_sentence( 89 | sentence=doc_text_processed, 90 | tokenizer=self.artifacts.mcr_tokenizer, 91 | maximum_word_length=10, 92 | ) 93 | return image, np.asarray([fin_text]).astype("float32") 94 | 95 | @bentoml.api(input=FileInput()) 96 | def predict_document_labels_mcr(self, file_stream): 97 | image, text = self.pre_process_mcr(file=file_stream) 98 | model = self.artifacts.mcr_model.signatures["serving_default"] 99 | model._num_positional_args = 2 100 | results = model(tf.constant(text), tf.constant(image)) 101 | conv_results = results.get("dense_1")[0].numpy() 102 | document_label = self.artifacts.mcr_labels[str(np.argmax(conv_results))] 103 | return {"document_type": document_label} 104 | -------------------------------------------------------------------------------- /document_classification/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | web: 4 | image: 142339138776.dkr.ecr.us-east-2.amazonaws.com/docuedge-model-zoo:latest 5 | ports: 6 | - "5000:5000" 7 | logging: 8 | driver: awslogs 9 | options: 10 | awslogs-group: docuedge-modelserver-ecs 11 | awslogs-region: us-east-2 12 | awslogs-stream-prefix: web 13 | volumes: 14 | - /app/temp 15 | -------------------------------------------------------------------------------- /document_classification/ecs-params.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | task_definition: 3 | task_execution_role: ecsTaskExecutionRoleBento 4 | ecs_network_mode: awsvpc 5 | task_size: 6 | mem_limit: 8GB 7 | cpu_limit: 4096 8 | efs_volumes: 9 | - name: docuedgedev-efs 10 | filesystem_id: fs-4717c93f 11 | root_directory: /smartbox-config 12 | run_params: 13 | network_configuration: 14 | awsvpc_configuration: 15 | subnets: 16 | - subnet-00e7bff093931a167 17 | - subnet-0345b051535c9625d 18 | security_groups: 19 | - sg-09b7e06cb8b13167d 20 | - sg-0601a52d4ea28af05 21 | assign_public_ip: ENABLED 22 | -------------------------------------------------------------------------------- /document_classification/hybrid_v1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from tensorflow.keras.preprocessing.image import load_img 4 | from tensorflow.keras.preprocessing.text import Tokenizer 5 | from tensorflow.keras.layers import ( 6 | Input, 7 | Conv2D, 8 | Dense, 9 | Flatten, 10 | Embedding, 11 | Concatenate, 12 | GlobalMaxPool1D, 13 | ) 14 | from tensorflow.keras.models import Model, load_model 15 | import os 16 | import json 17 | import mlflow 18 | import mlflow.tensorflow 19 | 20 | 21 | tracking_uri = ( 22 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 23 | ) 24 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 25 | 26 | 27 | def read_data(path): 28 | bow = open(path, "r") 29 | data = bow.readlines() 30 | all_data_paths = [] 31 | all_texts = [] 32 | y_labels = {} 33 | for line in data: 34 | line_data = line.split("####") 35 | all_data_paths.append(line_data[0]) 36 | all_texts.append(line_data[-1][:-1]) 37 | label = line_data[0].split("/")[-2] 38 | if label not in y_labels: 39 | y_labels[label] = len(y_labels) 40 | 41 | rev_labels = {} 42 | for key, val in y_labels.items(): 43 | rev_labels[val] = key 44 | 45 | return all_data_paths, y_labels, rev_labels, all_texts 46 | 47 | 48 | def 
tokenize_sentence(sentence, tokenizer, maximum_word_length): 49 | updated_sentence = sentence.split(" ") 50 | tok_sent = [] 51 | for word in updated_sentence: 52 | if word in tokenizer.word_index: 53 | tok_sent.append(tokenizer.word_index[word]) 54 | else: 55 | tok_sent.append(0) 56 | if len(tok_sent) != maximum_word_length: 57 | delta = maximum_word_length - len(tok_sent) 58 | for i in range(delta): 59 | tok_sent.append(0) 60 | return tok_sent 61 | 62 | 63 | def data_loader_text( 64 | bs, data, y_lab, tokenizer, text_data, image_input_shape, max_word_length 65 | ): 66 | while True: 67 | images = [] 68 | labels = [] 69 | texts = [] 70 | while len(images) < bs: 71 | indice = random.randint(0, len(data) - 1) 72 | target = data[indice].split("/")[-2] 73 | labels.append(y_lab[target]) 74 | 75 | test_img = np.asarray(load_img(data[indice], target_size=image_input_shape)) 76 | img = np.divide(test_img, 255.0) 77 | images.append(img) 78 | 79 | tok_sen = tokenize_sentence( 80 | text_data[indice], tokenizer, maximum_word_length=max_word_length 81 | ) 82 | texts.append(tok_sen) 83 | yield [np.asarray(images), np.asarray(texts)], np.asarray(labels) 84 | 85 | 86 | def model_arc(y_labels, tokenizer, text_model_inp_shape, image_inp_shape): 87 | inp_layer_texts = Input(shape=text_model_inp_shape) 88 | inp_layer_images = Input(shape=image_inp_shape) 89 | 90 | embedding_layer = Embedding( 91 | input_dim=len(tokenizer.word_index) + 1, 92 | output_dim=64, 93 | input_length=text_model_inp_shape, 94 | trainable=True, 95 | )(inp_layer_texts) 96 | pooling_layer = GlobalMaxPool1D()(embedding_layer) 97 | dense_layer = Dense(units=64, activation="relu")(pooling_layer) 98 | # lstm_layer = Bidirectional(LSTM(units=32))(embedding_layer) 99 | 100 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 101 | inp_layer_images 102 | ) 103 | flatten_layer = Flatten()(conv_layer) 104 | 105 | concat_layer = Concatenate()([flatten_layer, dense_layer]) 106 | out_layer = Dense(len(y_labels), activation="softmax")(concat_layer) 107 | 108 | model = Model([inp_layer_images, inp_layer_texts], out_layer) 109 | model.compile( 110 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 111 | ) 112 | return model 113 | 114 | 115 | def train_hybrid_v1( 116 | text_plus_file_path: str, 117 | batch_size: int, 118 | epochs: int, 119 | image_shape: int, 120 | max_words: int, 121 | artifact_name: str, 122 | save_dir_path: str, 123 | trained_model_path: str, 124 | experiment_name: str, 125 | ): 126 | 127 | mlflow.set_tracking_uri(tracking_uri) 128 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 129 | try: 130 | expr_name = experiment_name # create a new experiment (do not replace) 131 | mlflow.create_experiment(expr_name, s3_bucket) 132 | mlflow.set_experiment(expr_name) 133 | experiment = mlflow.get_experiment_by_name(experiment_name) 134 | except: 135 | experiment = mlflow.get_experiment_by_name(experiment_name) 136 | 137 | all_imgs_path, y_labels, rev_labels, all_text = read_data(path=text_plus_file_path) 138 | num_train_img = len(all_imgs_path) 139 | 140 | with open( 141 | os.path.join(save_dir_path, artifact_name, f"rev_labels_{artifact_name}.json"), 142 | "w+", 143 | ) as tar: 144 | json.dump(rev_labels, tar) 145 | 146 | print("target_encodings: ", y_labels) 147 | print("Number of training images: ", num_train_img) 148 | 149 | bow = open(text_plus_file_path, "r") 150 | tokenizer = Tokenizer() 151 | tokenizer.fit_on_texts(bow.read().split("####")) 152 | 153 | train_gen = 
data_loader_text( 154 | tokenizer=tokenizer, 155 | y_lab=y_labels, 156 | data=all_imgs_path, 157 | text_data=all_text, 158 | bs=batch_size, 159 | image_input_shape=(image_shape, image_shape, 3), 160 | max_word_length=max_words, 161 | ) 162 | if os.path.isfile(trained_model_path): 163 | model = load_model(trained_model_path) 164 | else: 165 | model = model_arc( 166 | y_labels=y_labels, 167 | tokenizer=tokenizer, 168 | text_model_inp_shape=(max_words,), 169 | image_inp_shape=(image_shape, image_shape, 3), 170 | ) 171 | mlflow.tensorflow.autolog(every_n_iter=1) 172 | with mlflow.start_run(experiment_id=experiment.experiment_id): 173 | mlflow.log_metrics( 174 | { 175 | "batch_size": batch_size, 176 | "epochs": epochs, 177 | "image_shape": image_shape, 178 | "max_words": max_words, 179 | } 180 | ) 181 | history = model.fit( 182 | x=train_gen, 183 | steps_per_epoch=num_train_img // batch_size, 184 | epochs=epochs, 185 | ) 186 | model.save( 187 | filepath=os.path.join( 188 | save_dir_path, artifact_name, "document_classifier.h5" 189 | ) 190 | ) 191 | meta_data_path = os.path.join(save_dir_path, artifact_name) 192 | for artifact in sorted(os.listdir(meta_data_path)): 193 | if artifact != ".DS_Store": 194 | artifact_path = os.path.join(meta_data_path, artifact) 195 | if ( 196 | os.path.isfile(artifact_path) 197 | and artifact_path.split(".")[-1] != "h5" 198 | ): 199 | print(f"artifact to be uploaded is: {artifact}") 200 | mlflow.log_artifact(local_path=artifact_path) 201 | 202 | artifact_uri = mlflow.get_artifact_uri() 203 | print(artifact_uri) 204 | mlflow.end_run() 205 | -------------------------------------------------------------------------------- /document_classification/pre_process_text.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import re 3 | from pdf2image import convert_from_path 4 | import os 5 | from tqdm import tqdm 6 | import pre_processing 7 | from PIL import Image 8 | import pytesseract 9 | 10 | nlp = spacy.load("en_core_web_sm") 11 | 12 | 13 | def helper(text): 14 | dummy = [] 15 | for word in text: 16 | dummy.append(str(word)) 17 | final = " ".join(dummy) 18 | return final 19 | 20 | 21 | def preprocess_spacy(text, num_of_words: int): 22 | text = str(text) 23 | text = text.split(" ") 24 | text = helper(text) 25 | text = str(text.lower()) 26 | # Remove all the special characters 27 | text = re.sub(r"\W", " ", text) 28 | text = re.sub(r"[^a-zA-Z ]+", "", text) 29 | # remove all single characters 30 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 31 | # Remove single characters from the start 32 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 33 | # Substituting multiple spaces with single space 34 | text = re.sub(r"\s+", " ", text, flags=re.I) 35 | text = nlp(text) 36 | filtered = [token.lemma_ for token in text if token.is_stop == False] 37 | text = " ".join(filtered[: num_of_words * 2]) 38 | text = text.strip().split(" ") 39 | text = " ".join(text[:num_of_words]) 40 | return text 41 | 42 | 43 | def read_text_from_pages( 44 | complete_folder_path: str, 45 | path_to_save_essential_data: str, 46 | meta_name: str, 47 | num_of_words: int, 48 | ): 49 | final_path_for_data = os.path.join( 50 | path_to_save_essential_data, meta_name, f"file_and_text_{meta_name}.txt" 51 | ) 52 | if os.path.isfile(final_path_for_data): 53 | data = open(final_path_for_data, "r").read() 54 | else: 55 | data = "null" 56 | print("#### Reading pages ####") 57 | doc_image_types = sorted(os.listdir(complete_folder_path)) 58 | text_of_all_pages = [] 59 | for 
doc_image_type in doc_image_types: 60 | if doc_image_type != ".DS_Store": 61 | print("DOCUMENT TYPE: ", doc_image_type) 62 | complete_doc_image_path = os.path.join(complete_folder_path, doc_image_type) 63 | pages = sorted(os.listdir(complete_doc_image_path)) 64 | for page in tqdm(pages): 65 | if page != ".DS_Store": 66 | page_path = os.path.join(complete_doc_image_path, page) 67 | if page_path not in data: 68 | document_page = Image.open(page_path) 69 | document_text = pytesseract.image_to_string(document_page) 70 | document_page.close() 71 | essential_file_path_and_text = ( 72 | page_path 73 | + "####" 74 | + preprocess_spacy(document_text, num_of_words=num_of_words) 75 | + "\n" 76 | ) 77 | text_of_all_pages.append(essential_file_path_and_text) 78 | 79 | if os.path.isfile(final_path_for_data): 80 | all_essential_data = open(final_path_for_data, "a+") 81 | all_essential_data.writelines(text_of_all_pages) 82 | else: 83 | all_essential_data = open(final_path_for_data, "w") 84 | all_essential_data.writelines(text_of_all_pages) 85 | return final_path_for_data 86 | 87 | 88 | def pdf_to_images(full_path_pdf: str, converted_images_path: str, meta_name: str): 89 | doc = full_path_pdf.split("/")[-1] 90 | index = 0 91 | OUTPUT_PATH = converted_images_path 92 | os.makedirs(name=OUTPUT_PATH, exist_ok=True) 93 | 94 | print("Document name: ", doc) 95 | if str(doc.split(".pdf")[-2]) + "_" + str(index) + ".jpg" not in os.listdir( 96 | converted_images_path 97 | ): 98 | pil_images = convert_from_path(full_path_pdf, dpi=300) 99 | 100 | for image in tqdm(pil_images): 101 | processed_image = pre_processing.preprocess_image_file(image) 102 | try: 103 | processed_image = Image.fromarray(processed_image) 104 | processed_image.save( 105 | os.path.join(OUTPUT_PATH, str(doc.split(".pdf")[-2])) 106 | + "_" 107 | + str(index) 108 | + ".jpg", 109 | format="JPEG", 110 | subsampling=0, 111 | quality=100, 112 | ) 113 | index += 1 114 | processed_image.close() 115 | except: 116 | index += 1 117 | else: 118 | pass -------------------------------------------------------------------------------- /document_classification/pre_processing.py: -------------------------------------------------------------------------------- 1 | """IMAGE PREPROCESSING FUNCTIONS 2 | """ 3 | import cv2 4 | import numpy as np 5 | from scipy.ndimage.filters import rank_filter 6 | 7 | # from sbox.utils.sbox_logger import logger 8 | import pytesseract 9 | import re 10 | import imutils 11 | from PIL import Image 12 | 13 | # print("error") # = logger(__name__) 14 | 15 | 16 | class PagePreprocess(object): 17 | def __init__(self, im): 18 | self.err = False 19 | self.orig_im = im 20 | self.orig_shape = self.orig_im.shape 21 | self.image = im 22 | 23 | def crop(self): 24 | try: 25 | self.image, self.num_tries = process_image(self.orig_im) 26 | self.crop_shape = self.image.shape 27 | return self.image 28 | except Exception as e: 29 | print("crop_obj_Error") # (f"Error: {e}", exc_info=True) 30 | 31 | def deskew(self): 32 | try: 33 | self.image, self.theta_est = process_skewed_crop(self.image) 34 | return self.image 35 | except Exception as e: 36 | print("deskew_obj_Error") # (f"Error: {e}", exc_info=True) 37 | 38 | 39 | def auto_canny(image, sigma=0.33): 40 | try: 41 | v = np.median(image) 42 | lower = int(max(0, (1.0 - sigma) * v)) 43 | upper = int(min(255, (1.0 + sigma) * v)) 44 | edged = cv2.Canny(image, lower, upper, True) 45 | return edged 46 | except Exception as e: 47 | print("auto_canny_Error") # (f"Error: {e}", exc_info=True) 48 | 49 | 50 | def 
dilate(image, kernel, iterations): 51 | dilated_image = cv2.dilate(image, kernel, iterations=iterations) 52 | return dilated_image 53 | 54 | 55 | def downscale_image(im, max_dim=2048): 56 | try: 57 | a, b = im.shape[:2] 58 | if max(a, b) <= max_dim: 59 | return 1.0, im 60 | 61 | scale = 1.0 * max_dim / max(a, b) 62 | new_im = cv2.resize(im, (int(b * scale), int(a * scale)), cv2.INTER_AREA) 63 | return scale, new_im 64 | except Exception as e: 65 | print("error") # (f"Error: {e}", exc_info=True) 66 | 67 | 68 | def find_components(im, max_components=16): 69 | try: 70 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) 71 | dilation = dilate(im, kernel, 6) 72 | 73 | count = 21 74 | n = 0 75 | sigma = 0.000 76 | 77 | while count > max_components: 78 | n += 1 79 | sigma += 0.005 80 | result = cv2.findContours(dilation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 81 | if len(result) == 3: 82 | _, contours, hierarchy = result 83 | elif len(result) == 2: 84 | contours, hierarchy = result 85 | possible = find_likely_rectangles(contours, sigma) 86 | count = len(possible) 87 | 88 | return (dilation, possible, n) 89 | except Exception as e: 90 | print("comp_error") # (f"Error: {e}", exc_info=True) 91 | 92 | 93 | def find_likely_rectangles(contours, sigma): 94 | try: 95 | contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] 96 | possible = [] 97 | for c in contours: 98 | 99 | peri = cv2.arcLength(c, True) 100 | approx = cv2.approxPolyDP(c, sigma * peri, True) 101 | box = make_box(approx) 102 | possible.append(box) 103 | 104 | return possible 105 | except Exception as e: 106 | print("likely_rec_error") # (f"Error: {e}", exc_info=True) 107 | 108 | 109 | def make_box(poly): 110 | try: 111 | x = [] 112 | y = [] 113 | for p in poly: 114 | for point in p: 115 | x.append(point[0]) 116 | y.append(point[1]) 117 | xmax = max(x) 118 | ymax = max(y) 119 | xmin = min(x) 120 | ymin = min(y) 121 | return (xmin, ymin, xmax, ymax) 122 | except Exception as e: 123 | print("bbox_error") # (f"Error: {e}", exc_info=True) 124 | 125 | 126 | def rect_union(crop1, crop2): 127 | x11, y11, x21, y21 = crop1 128 | x12, y12, x22, y22 = crop2 129 | return min(x11, x12), min(y11, y12), max(x21, x22), max(y21, y22) 130 | 131 | 132 | def rect_area(crop): 133 | x1, y1, x2, y2 = crop 134 | return max(0, x2 - x1) * max(0, y2 - y1) 135 | 136 | 137 | def crop_image(im, rect, scale): 138 | try: 139 | xmin, ymin, xmax, ymax = rect 140 | crop = [xmin, ymin, xmax, ymax] 141 | xmin, ymin, xmax, ymax = [int(x / scale) for x in crop] 142 | if ((ymax - ymin) * (xmax - xmin)) > 0.25 * im.size: 143 | cropped = im[ymin:ymax, xmin:xmax] 144 | else: 145 | cropped = im 146 | return cropped 147 | except Exception as e: 148 | print("crop_error_1") # (f"Error: {e}", exc_info=True) 149 | 150 | 151 | def reduce_noise_raw(im): 152 | bilat = cv2.bilateralFilter(im, 4, 75, 75) 153 | blur = cv2.medianBlur(bilat, 1) 154 | return blur 155 | 156 | 157 | def reduce_noise_edges(im): 158 | try: 159 | structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) 160 | opening = cv2.morphologyEx(im, cv2.MORPH_OPEN, structuring_element) 161 | maxed_rows = rank_filter(opening, -4, size=(1, 20)) 162 | maxed_cols = rank_filter(opening, -4, size=(20, 1)) 163 | debordered = np.minimum(np.minimum(opening, maxed_rows), maxed_cols) 164 | return debordered 165 | except Exception as e: 166 | print("noise_red_Error") # (f"Error: {e}", exc_info=True) 167 | 168 | 169 | def rects_are_vertical(rect1, rect2, rect_align=2): 170 | try: 171 | xmin1, ymin1, 
xmax1, ymax1 = rect1 172 | xmin2, ymin2, xmax2, ymax2 = rect2 173 | 174 | midpoint1 = (xmin1 + xmax1) / 2 175 | midpoint2 = (xmin2 + xmax2) / 2 176 | dist = abs(midpoint1 - midpoint2) 177 | 178 | rectarea1 = rect_area(rect1) 179 | rectarea2 = rect_area(rect2) 180 | if rectarea1 > rectarea2: 181 | thres = (xmax1 - xmin1) * rect_align 182 | else: 183 | thres = (xmax2 - xmin2) * rect_align 184 | 185 | if thres > dist: 186 | align = True 187 | else: 188 | align = False 189 | return align 190 | except Exception as e: 191 | print("vert_rec_Error") # (f"Error: {e}", exc_info=True) 192 | 193 | 194 | def find_final_crop(im, rects, orig_im): 195 | try: 196 | current = None 197 | for rect in rects: 198 | if current is None: 199 | current = rect 200 | continue 201 | 202 | aligned = rects_are_vertical(current, rect) 203 | 204 | if not aligned: 205 | continue 206 | 207 | current = rect_union(current, rect) 208 | if current is not None: 209 | return current 210 | else: 211 | return (0, 0, orig_im.shape[0], orig_im.shape[1]) 212 | except Exception as e: 213 | print("crop_Error") # (f"Error: {e}", exc_info=True) 214 | 215 | 216 | def process_image(orig_im): 217 | try: 218 | scale, im = downscale_image(orig_im) 219 | 220 | blur = reduce_noise_raw(im.copy()) 221 | 222 | edges = auto_canny(blur.copy()) 223 | 224 | debordered = reduce_noise_edges(edges.copy()) 225 | 226 | dilation, rects, num_tries = find_components(debordered, 16) 227 | 228 | final_rect = find_final_crop(dilation, rects, orig_im) 229 | 230 | cropped = crop_image(orig_im, final_rect, scale) 231 | # kernel = np.ones((3, 3), np.float32) / 25 232 | # smooth2d = cv2.filter2D(cropped, -1, kernel=kernel) 233 | return (cropped, num_tries) 234 | except Exception as e: 235 | print("process") # (f"Error: {e}", exc_info=True) 236 | 237 | 238 | def rad_to_deg(theta): 239 | return theta * 180 / np.pi 240 | 241 | 242 | def rotate(image, theta): 243 | try: 244 | (h, w) = image.shape[:2] 245 | center = (w / 2, h / 2) 246 | M = cv2.getRotationMatrix2D(center, theta, 1) 247 | rotated = cv2.warpAffine( 248 | image, 249 | M, 250 | (int(w), int(h)), 251 | flags=cv2.INTER_LINEAR,  # warpAffine's fourth positional argument is dst, so the interpolation flag must be passed by keyword 252 | borderMode=cv2.BORDER_CONSTANT, 253 | borderValue=(255, 255, 255), 254 | ) 255 | return rotated 256 | except Exception as e: 257 | print("rotation_error") # (f"Error: {e}", exc_info=True) 258 | 259 | 260 | def angle_calculation(gray): 261 | gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY) 262 | gray = cv2.bitwise_not(gray) 263 | thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] 264 | 265 | coords = np.column_stack(np.where(thresh > 0)) 266 | # print(coords, coords.shape) 267 | 268 | min_y = coords[0][0] 269 | max_y = coords[-1][0] 270 | min_x = coords[0][1]  # x comes from the column index 271 | max_x = coords[-1][1] 272 | 273 | left_most = coords[0] 274 | right_most = coords[0] 275 | top_most = coords[0] 276 | bottom_most = coords[0] 277 | # print(coords[0], coords[-1]) 278 | for i in range(1, coords.shape[0]): 279 | y, x = coords[i][0], coords[i][1] 280 | if y <= min_y: 281 | min_y = y 282 | top_most = coords[i] 283 | elif y >= max_y: 284 | max_y = y 285 | bottom_most = coords[i] 286 | if x <= min_x: 287 | min_x = x 288 | left_most = coords[i] 289 | elif x >= max_x: 290 | max_x = x 291 | right_most = coords[i] 292 | # print(top_most, left_most, bottom_most, right_most) 293 | 294 | slopes = [] 295 | edge_coor = [top_most, left_most, bottom_most, right_most] 296 | for i in range(0, len(edge_coor)): 297 | if i == len(edge_coor) - 1: 298 | if abs((edge_coor[0][1] - edge_coor[i][1])) >= 10:
299 | angle = ( 300 | ( 301 | (edge_coor[0][0] - edge_coor[i][0]) 302 | / (edge_coor[0][1] - edge_coor[i][1]) 303 | ) 304 | * 180 305 | ) / 3.14  # slope scaled to degrees; a small-angle approximation of arctan 306 | slopes.append(angle) 307 | else: 308 | slopes.append(0.0) 309 | else: 310 | if abs((edge_coor[i + 1][1] - edge_coor[i][1])) >= 10: 311 | angle = ( 312 | ( 313 | (edge_coor[i + 1][0] - edge_coor[i][0]) 314 | / (edge_coor[i + 1][1] - edge_coor[i][1]) 315 | ) 316 | * 180 317 | ) / 3.14 318 | slopes.append(angle) 319 | else: 320 | slopes.append(0.0) 321 | # img = cv2.circle(thresh, (edge_coor[i][1], edge_coor[i][0]), 5, (255, 0, 0), 2) 322 | 323 | slopes = np.asarray(slopes) 324 | if len(np.where(slopes == 0.0)[0]) >= 2: 325 | # at least two flat edges: treat the page as straight, don't rotate 326 | return None 327 | else: 328 | # average the opposing edge slopes, then rotate 329 | neg_slope = (slopes[0] + slopes[2]) / 2 330 | pos_slope = (slopes[1] + slopes[3]) / 2 331 | # print(pos_slope, neg_slope) 332 | new_pos_slope = pos_slope 333 | new_neg_slope = neg_slope 334 | if pos_slope > 90: 335 | if pos_slope < 180: 336 | new_pos_slope = 180 - pos_slope 337 | else: 338 | new_pos_slope = pos_slope - ((pos_slope // 180) * 180) 339 | # print(new_pos_slope) 340 | if neg_slope < -90: 341 | new_neg_slope = 180 + neg_slope 342 | # print(new_pos_slope, new_neg_slope) 343 | if new_pos_slope <= new_neg_slope: 344 | fin_angle = pos_slope 345 | else: 346 | fin_angle = neg_slope 347 | 348 | if fin_angle < -90: 349 | rot_angle = 180 + fin_angle 350 | elif fin_angle > 90: 351 | rot_angle = -(180 - fin_angle) 352 | elif -90 < fin_angle < 0: 353 | rot_angle = fin_angle 354 | elif 0 < fin_angle < 90: 355 | rot_angle = fin_angle 356 | return rot_angle 357 | 358 | 359 | def estimate_skew(image): 360 | try: 361 | osd = pytesseract.image_to_osd(image) 362 | angle = float(re.search(r"(?<=Rotate: )\d+", osd).group(0)) 363 | if angle == 0: 364 | # fin_image = rotate(image_gray, angle) 365 | edges = auto_canny(image) 366 | # print(edges.shape) 367 | # print("edges found: ", edges) 368 | lines = cv2.HoughLines(edges, 1, np.pi / 270, 400) 369 | # print("lines found: ", lines) 370 | if lines is not None: 371 | new = edges.copy() 372 | thetas = [] 373 | for line in lines: 374 | for rho, theta in line: 375 | a = np.cos(theta) 376 | b = np.sin(theta) 377 | x0 = a * rho 378 | y0 = b * rho 379 | x1 = int(x0 + 1000 * (-b)) 380 | y1 = int(y0 + 1000 * (a)) 381 | x2 = int(x0 - 1000 * (-b)) 382 | y2 = int(y0 - 1000 * (a)) 383 | if theta > np.pi / 3 and theta < np.pi * 2 / 3: 384 | thetas.append(theta) 385 | new = cv2.line(new, (x1, y1), (x2, y2), (255, 255, 255), 1) 386 | 387 | theta_mean = np.mean(thetas) 388 | theta = -(90 - (rad_to_deg(theta_mean) if len(thetas) > 0 else 0)) 389 | else: 390 | # theta = angle_calculation(image) 391 | theta = 0.0 392 | else: 393 | theta = angle 394 | return theta 395 | except Exception as e: 396 | print("theta_error") # (f"Error: {e}", exc_info=True) 397 | 398 | 399 | def process_skewed_crop(image): 400 | try: 401 | theta = estimate_skew(image) 402 | # print(theta) 403 | # ret, thresh = cv2.threshold(image, 0, 127, cv2.THRESH_OTSU) 404 | # print(thresh) 405 | if theta is not None and (theta % 90) != 0: 406 | rotated = rotate(image, theta) 407 | elif theta is not None and (theta % 90) == 0: 408 | rotated = imutils.rotate_bound(image, theta) 409 | else: 410 | rotated = image 411 | # print(rotated) 412 | return rotated, theta 413 | except Exception as e: 414 | print("skew_Error") # (f"Error: {e}", exc_info=True) 415 | 416 |
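# How the deskew above settles on an angle, in brief: estimate_skew trusts Tesseract's
# OSD rotation first and falls back to Hough lines only when OSD reports 0, keeping the
# near-horizontal lines (pi/3 < theta < 2*pi/3) and mapping their mean angle through
# theta = -(90 - degrees(mean_theta)); process_skewed_crop then rotates with warpAffine
# for non-multiples of 90 degrees and with imutils.rotate_bound for exact multiples.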
417 | def preprocess_image(file_path: str): 418 | try: 419 | gray_page = cv2.imread(file_path, 0) 420 | process_page = PagePreprocess(gray_page) 421 | _ = process_page.crop() 422 | deskewed_page = process_page.deskew() 423 | # cv2.imwrite(file_path, deskewed_page) 424 | return deskewed_page 425 | except Exception as e: 426 | print("process_image_error") # (f"Error: {e}", exc_info=True) 427 | 428 | 429 | def preprocess_image_file(img): 430 | try: 431 | # converted_image = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR) 432 | gray_page = cv2.cvtColor(np.array(img), cv2.COLOR_BGR2GRAY) 433 | # gray_page = cv2.cvtColor(gray_page, cv2.COLOR_BGR2RGB) 434 | process_page = PagePreprocess(gray_page) 435 | _ = process_page.crop() 436 | deskewed_page = process_page.deskew() 437 | return deskewed_page 438 | except Exception as e: 439 | print("error") # (f"Error: {e}", exc_info=True) 440 | -------------------------------------------------------------------------------- /document_classification/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from hybrid_v1 import train_hybrid_v1 4 | from pre_process_text import ( 5 | pdf_to_images, 6 | read_text_from_pages, 7 | ) 8 | 9 | 10 | def process( 11 | dataset_path: str, 12 | save_dir: str, 13 | pdf_check: bool, 14 | artifact_name: str, 15 | num_words_to_read: int, 16 | ): 17 | updated_dataset_path = os.path.join(save_dir, artifact_name, "dataset") 18 | os.makedirs(updated_dataset_path, exist_ok=True) 19 | for document_type in sorted(os.listdir(dataset_path)): 20 | if document_type != ".DS_Store": 21 | folder_path = os.path.join(dataset_path, document_type) 22 | updated_document_type_folder_path = os.path.join( 23 | updated_dataset_path, document_type 24 | ) 25 | os.makedirs(updated_document_type_folder_path, exist_ok=True) 26 | for documents in sorted(os.listdir(folder_path)): 27 | if documents != ".DS_Store": 28 | document_path = os.path.join(folder_path, documents) 29 | if pdf_check: 30 | # Perform conversion and store the images in a temp folder 31 | pdf_to_images( 32 | full_path_pdf=document_path, 33 | converted_images_path=updated_document_type_folder_path, 34 | meta_name=artifact_name, 35 | ) 36 | if pdf_check: 37 | images_data_path = os.path.join(save_dir, artifact_name) 38 | else: 39 | images_data_path = dataset_path 40 | master_data_path = read_text_from_pages( 41 | complete_folder_path=images_data_path, 42 | path_to_save_essential_data=save_dir, 43 | meta_name=artifact_name, 44 | num_of_words=num_words_to_read, 45 | ) 46 | return master_data_path 47 | 48 | 49 | def single_level(args): 50 | all_data_path = process( 51 | dataset_path=args.data_path, 52 | save_dir=args.file_path, 53 | pdf_check=bool(args.pdfs), 54 | artifact_name=args.art_name, 55 | num_words_to_read=int(args.num_of_words), 56 | ) 57 | train_hybrid_v1( 58 | text_plus_file_path=all_data_path, 59 | batch_size=int(args.batch_size), 60 | epochs=int(args.epochs), 61 | image_shape=int(args.img_shape), 62 | max_words=int(args.num_of_words), 63 | artifact_name=args.art_name, 64 | save_dir_path=args.file_path, 65 | trained_model_path=args.model_path, 66 | experiment_name=args.experiment_name, 67 | ) 68 | 69 | 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument("-dp", "--data_path", help="File path of the dataset") 72 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts") 73 | parser.add_argument("-a", "--art_name", help="Artifacts name") 74 | parser.add_argument("-p", "--pdfs", action="store_true", help="Set this flag when the dataset consists of PDFs")  # default=False with a CLI-supplied string was always truthy 75 | parser.add_argument("-n", "--num_of_words", default=10, help="No of words to read") 76 | parser.add_argument("-b", "--batch_size", default=8, help="Batch size for training") 77 | parser.add_argument("-e", "--epochs", default=3, help="Number of epochs") 78 | parser.add_argument("-is", "--img_shape", default=100, help="One dimension of image") 79 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model") 80 | parser.add_argument( 81 | "-exp", 82 | "--experiment_name", 83 | default="document_classification", 84 | help="Name of the experiment for tracking", 85 | ) 86 | args = parser.parse_args() 87 | single_level(args=args) # For single level document classification 88 |
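A typical invocation of this training entry point (a sketch; the dataset and artifact paths and the artifact name are illustrative, not taken from the repo, and with the store_true flag above passing -p switches on the PDF-to-image conversion step):

python train.py -dp /path/to/dataset -fp /path/to/artifacts -a mcr -p -n 10 -b 8 -e 3 -is 100 -exp document_classification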
parser.add_argument("-p", "--pdfs", default=False, help="Dataset type") 75 | parser.add_argument("-n", "--num_of_words", default=10, help="No of words to read") 76 | parser.add_argument("-b", "--batch_size", default=8, help="Batch size for training") 77 | parser.add_argument("-e", "--epochs", default=3, help="Number of epochs") 78 | parser.add_argument("-is", "--img_shape", default=100, help="One dimension of image") 79 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model") 80 | parser.add_argument( 81 | "-exp", 82 | "--experiment_name", 83 | default="document_classification", 84 | help="Name of the experiment for tracking", 85 | ) 86 | args = parser.parse_args() 87 | single_level(args=args) # For single level document classification 88 | -------------------------------------------------------------------------------- /image_classifier/__pycache__/bento_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/image_classifier/__pycache__/bento_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /image_classifier/artifacts/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/image_classifier/artifacts/saved_model.pb -------------------------------------------------------------------------------- /image_classifier/artifacts/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/image_classifier/artifacts/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /image_classifier/artifacts/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/image_classifier/artifacts/variables/variables.index -------------------------------------------------------------------------------- /image_classifier/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import ImageClassifier 2 | from tensorflow.keras.models import load_model 3 | import tensorflow as tf 4 | import json 5 | 6 | 7 | def classifier_models(model_service, model_path: str, labels_path: str): 8 | model_cnn = load_model(model_path) 9 | tf.saved_model.save(model_cnn, "artifacts/") 10 | model_cnn = tf.saved_model.load("artifacts/") 11 | model_service.pack("model", model_cnn) 12 | 13 | with open(labels_path, "r") as f: 14 | labels = json.load(f) 15 | model_service.pack("labels", labels) 16 | 17 | 18 | def main(): 19 | model_service = ImageClassifier() 20 | classifier_models( 21 | model_service=model_service, model_path=model_path, labels_path=labels_path 22 | ) 23 | saved_path = model_service.save() 24 | 25 | 26 | model_path = "/Users/vsatpathy/Desktop/docs/training_data/intel/image_classifier.h5" 27 | labels_path = "/Users/vsatpathy/Desktop/docs/training_data/intel/rev_labels_intel.json" 28 | main() -------------------------------------------------------------------------------- 
/image_classifier/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.adapters import FileInput 3 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 4 | from bentoml.service.artifacts.common import JSONArtifact 5 | 6 | import tensorflow as tf 7 | from tensorflow.keras.preprocessing.image import load_img 8 | import importlib.util 9 | import numpy as np 10 | from PIL import Image 11 | 12 | 13 | @bentoml.env(infer_pip_packages=True) 14 | @bentoml.artifacts([TensorflowSavedModelArtifact("model"), JSONArtifact("labels")]) 15 | class ImageClassifier(bentoml.BentoService): 16 | def pre_process_image(self, image_file): 17 | image = np.asarray( 18 | Image.open(image_file).convert(mode="RGB").resize((100, 100)) 19 | ) 20 | image = np.divide(image, 255.0) 21 | image = np.asarray([image]).astype("float32") 22 | return image 23 | 24 | @bentoml.api(input=FileInput()) 25 | def predict_image(self, file_stream): 26 | image = self.pre_process_image(image_file=file_stream) 27 | model = self.artifacts.model.signatures["serving_default"] 28 | model._num_positional_args = 1 29 | results = model(tf.constant(image)) 30 | print(results) 31 | conv_results = results.get("dense")[0].numpy() 32 | label = self.artifacts.labels[str(np.argmax(conv_results))] 33 | return {"label": label} 34 | -------------------------------------------------------------------------------- /image_classifier/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | web: 4 | image: 142339138776.dkr.ecr.us-east-2.amazonaws.com/docuedge-model-zoo:latest 5 | ports: 6 | - "5000:5000" 7 | logging: 8 | driver: awslogs 9 | options: 10 | awslogs-group: docuedge-modelserver-ecs 11 | awslogs-region: us-east-2 12 | awslogs-stream-prefix: web 13 | volumes: 14 | - /app/temp 15 | -------------------------------------------------------------------------------- /image_classifier/ecs-params.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | task_definition: 3 | task_execution_role: ecsTaskExecutionRoleBento 4 | ecs_network_mode: awsvpc 5 | task_size: 6 | mem_limit: 8GB 7 | cpu_limit: 4096 8 | efs_volumes: 9 | - name: docuedgedev-efs 10 | filesystem_id: fs-4717c93f 11 | root_directory: /smartbox-config 12 | run_params: 13 | network_configuration: 14 | awsvpc_configuration: 15 | subnets: 16 | - subnet-00e7bff093931a167 17 | - subnet-0345b051535c9625d 18 | security_groups: 19 | - sg-09b7e06cb8b13167d 20 | - sg-0601a52d4ea28af05 21 | assign_public_ip: ENABLED 22 | -------------------------------------------------------------------------------- /image_classifier/infer.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import load_model 2 | from tensorflow.keras.preprocessing.image import load_img 3 | 4 | import numpy as np 5 | import os 6 | import random 7 | 8 | 9 | def preprocess(image_path: str, image_shape: tuple = None): 10 | image_dimensions = (100, 100, 3) 11 | if image_shape: 12 | pass 13 | else: 14 | image_shape = image_dimensions 15 | test_inp = image_path 16 | test_img = np.asarray(load_img(test_inp, target_size=image_shape)) 17 | test_img = np.divide(test_img, 255.0) 18 | test_img = np.asarray([test_img]).astype("float32") 19 | return test_img 20 | 21 | 22 | def predict(folder_path: str, model_path: str): 23 | image_dimensions = (100, 100, 3) 24 | 
full_image_path = os.path.join(folder_path, random.choice(os.listdir(folder_path))) 25 | model = load_model(model_path) 26 | image = preprocess(image_path=full_image_path, image_shape=image_dimensions) 27 | results = model.predict(image) 28 | print(full_image_path) 29 | print(np.argmax(results[0])) 30 | 31 | 32 | model_path = "/Users/vsatpathy/Desktop/docs/training_data/intel/image_classifier.h5" 33 | folder_path = ( 34 | "/Users/vsatpathy/Desktop/off_POCs/intel-image-classification/seg_train/buildings" 35 | ) 36 | predict(folder_path=folder_path, model_path=model_path) 37 | -------------------------------------------------------------------------------- /image_classifier/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import mlflow 5 | from mlflow import pyfunc 6 | import argparse 7 | import json 8 | 9 | from tensorflow.keras.preprocessing.image import load_img 10 | from tensorflow.keras.models import Model, load_model 11 | from tensorflow.keras.layers import ( 12 | Input, 13 | Conv2D, 14 | Dense, 15 | Flatten, 16 | ) 17 | 18 | tracking_uri = ( 19 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 20 | ) 21 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 22 | 23 | 24 | def model_arc(y_labels: dict, image_inp_shape: tuple): 25 | inp_layer_images = Input(shape=image_inp_shape) 26 | 27 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 28 | inp_layer_images 29 | ) 30 | flatten_layer = Flatten()(conv_layer) 31 | 32 | out_layer = Dense(len(y_labels), activation="softmax")(flatten_layer) 33 | 34 | model = Model(inp_layer_images, out_layer) 35 | model.compile( 36 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 37 | ) 38 | return model 39 | 40 | 41 | def data_loader(gt_data_path: list, gt_labels: dict, bs: int, image_shape: tuple): 42 | while True: 43 | images = [] 44 | labels = [] 45 | while len(images) < bs: 46 | indice = random.randint(0, len(gt_data_path) - 1) 47 | image_path = gt_data_path[indice] 48 | 49 | label = gt_labels.get(image_path.split("/")[-2]) 50 | labels.append(label) 51 | 52 | test_img = np.asarray(load_img(image_path, target_size=image_shape)) 53 | img = np.divide(test_img, 255.0) 54 | images.append(img) 55 | yield np.asarray(images), np.asarray(labels) 56 | 57 | 58 | def read_data(data_path: str): 59 | folders = os.listdir(data_path) 60 | 61 | all_images_paths = [] 62 | all_labels = {} 63 | for label in folders: 64 | if label != ".DS_Store": 65 | images = os.path.join(data_path, label) 66 | for image in os.listdir(images): 67 | full_image_path = os.path.join(images, image) 68 | all_images_paths.append(full_image_path) 69 | if label not in all_labels: 70 | all_labels[label] = len(all_labels) 71 | rev_labels = {} 72 | for key, val in all_labels.items(): 73 | rev_labels[val] = key 74 | return all_images_paths, all_labels, rev_labels 75 | 76 | 77 | def train( 78 | image_shape: int, 79 | epochs: int, 80 | batch_size: int, 81 | data_path: str, 82 | save_dir_path: str, 83 | art_name: str, 84 | exp_name: str, 85 | trained_model_path: str, 86 | ): 87 | mlflow.set_tracking_uri(tracking_uri) 88 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 89 | try: 90 | expr_name = exp_name # create a new experiment (do not replace) 91 | mlflow.create_experiment(expr_name, s3_bucket) 92 | mlflow.set_experiment(expr_name) 93 | experiment = mlflow.get_experiment_by_name(exp_name) 94 
| except Exception: 95 | experiment = mlflow.get_experiment_by_name(exp_name) 96 | 97 | mlflow.tensorflow.autolog(every_n_iter=1) 98 | with mlflow.start_run(experiment_id=experiment.experiment_id): 99 | image_dimensions = (image_shape, image_shape, 3) 100 | no_of_epochs = epochs 101 | batch_size = batch_size 102 | dataset_path = data_path 103 | gt_image_paths, gt_labels, gt_rev_labels = read_data(data_path=dataset_path) 104 | os.makedirs(os.path.join(save_dir_path, art_name), exist_ok=True) 105 | 106 | mlflow.log_metrics( 107 | { 108 | "batch_size": batch_size, 109 | "epochs": epochs, 110 | "image_shape": image_shape, 111 | } 112 | ) 113 | 114 | with open( 115 | os.path.join(save_dir_path, art_name, f"rev_labels_{art_name}.json"), 116 | "w+", 117 | ) as tar: 118 | json.dump(gt_rev_labels, tar) 119 | 120 | print("target_encodings: ", gt_labels) 121 | print("Number of training images: ", len(gt_image_paths)) 122 | 123 | train_gen = data_loader( 124 | gt_data_path=gt_image_paths, 125 | gt_labels=gt_labels, 126 | bs=batch_size, 127 | image_shape=image_dimensions, 128 | ) 129 | 130 | if os.path.isfile(trained_model_path): 131 | model = load_model(trained_model_path) 132 | else: 133 | model = model_arc(y_labels=gt_labels, image_inp_shape=image_dimensions) 134 | model.fit( 135 | x=train_gen, 136 | steps_per_epoch=len(gt_image_paths) // batch_size, 137 | epochs=no_of_epochs, 138 | ) 139 | model.save( 140 | filepath=os.path.join(save_dir_path, art_name, f"image_classifier.h5") 141 | ) 142 | 143 | meta_data_path = os.path.join(save_dir_path, art_name) 144 | for artifact in sorted(os.listdir(meta_data_path)): 145 | if artifact != ".DS_Store": 146 | artifact_path = os.path.join(meta_data_path, artifact) 147 | if ( 148 | os.path.isfile(artifact_path) 149 | and artifact_path.split(".")[-1] != "h5" 150 | ): 151 | print(f"artifact to be uploaded is: {artifact}") 152 | mlflow.log_artifact(local_path=artifact_path) 153 | 154 | artifact_uri = mlflow.get_artifact_uri() 155 | print(artifact_uri) 156 | mlflow.end_run() 157 | 158 | 159 | parser = argparse.ArgumentParser() 160 | parser.add_argument("-dp", "--data_path", help="File path of the dataset") 161 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts") 162 | parser.add_argument("-a", "--art_name", help="Artifacts name") 163 | parser.add_argument("-b", "--batch_size", default=8, type=int, help="Batch size for training") 164 | parser.add_argument("-e", "--epochs", default=3, type=int, help="Number of epochs") 165 | parser.add_argument("-is", "--img_shape", default=100, type=int, help="One dimension of image") 166 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model") 167 | parser.add_argument( 168 | "-exp", 169 | "--experiment_name", 170 | default="intel_classification", 171 | help="Name of the experiment for tracking", 172 | ) 173 | args = parser.parse_args() 174 | train( 175 | image_shape=args.img_shape, 176 | epochs=args.epochs, 177 | batch_size=args.batch_size, 178 | data_path=args.data_path, 179 | save_dir_path=args.file_path, 180 | art_name=args.art_name, 181 | exp_name=args.experiment_name, 182 | trained_model_path=args.model_path, 183 | ) 184 | -------------------------------------------------------------------------------- /multiple_models/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import ModelZoo 2 | import tensorflow as tf 3 | from tensorflow.keras.models import load_model 4 | from tensorflow.keras.preprocessing.text import Tokenizer 5 |
import spacy 6 | import json 7 | 8 | 9 | def mcr_models( 10 | model_zoo, tf_model_path_mcr: str, text_file_path_mcr: str, labels_path_mcr: str 11 | ): 12 | model_cnn = load_model(tf_model_path_mcr) 13 | tf.saved_model.save(model_cnn, "artifacts/") 14 | model_cnn = tf.saved_model.load("artifacts/") 15 | model_zoo.pack("mcr_model", model_cnn) 16 | 17 | text_model = spacy.load("en_core_web_sm") 18 | model_zoo.pack("mcr_spacy_model", text_model) 19 | 20 | tokenizer = Tokenizer() 21 | with open(text_file_path_mcr, "r") as f: 22 | bow = f.read() 23 | tokenizer.fit_on_texts(bow.split("####")) 24 | model_zoo.pack("mcr_tokenizer", tokenizer) 25 | 26 | with open(labels_path_mcr, "r") as f: 27 | labels_mcr = json.load(f) 28 | model_zoo.pack("mcr_labels", labels_mcr) 29 | 30 | 31 | def som_models( 32 | model_zoo, 33 | tf_model_path_som: str, 34 | text_file_path_som: str, 35 | master_labels_path: str, 36 | sub_labels_path: str, 37 | ): 38 | model_cnn = load_model(tf_model_path_som) 39 | tf.saved_model.save(model_cnn, "artifacts/") 40 | model_cnn = tf.saved_model.load("artifacts/") 41 | model_zoo.pack("som_model", model_cnn) 42 | 43 | text_model = spacy.load("en_core_web_sm") 44 | model_zoo.pack("som_spacy_model", text_model) 45 | 46 | tokenizer = Tokenizer() 47 | with open(text_file_path_som, "r") as f: 48 | bow = f.read() 49 | tokenizer.fit_on_texts(bow.split("####")) 50 | model_zoo.pack("som_tokenizer", tokenizer) 51 | 52 | with open(master_labels_path, "r") as f: 53 | labels_som = json.load(f) 54 | model_zoo.pack("som_master_labels", labels_som) 55 | 56 | with open(sub_labels_path, "r") as g: 57 | sub_labels_som = json.load(g) 58 | model_zoo.pack("som_sub_labels", sub_labels_som) 59 | 60 | 61 | def main(): 62 | model_zoo = ModelZoo() 63 | mcr_models( 64 | model_zoo=model_zoo, 65 | tf_model_path_mcr=tf_model_path_mcr, 66 | text_file_path_mcr=text_file_path_mcr, 67 | labels_path_mcr=labels_path_mcr, 68 | ) 69 | som_models( 70 | model_zoo=model_zoo, 71 | tf_model_path_som=tf_model_path_som, 72 | text_file_path_som=text_file_path_som, 73 | master_labels_path=master_labels_path, 74 | sub_labels_path=sub_labels_path, 75 | ) 76 | saved_path = model_zoo.save() 77 | 78 | 79 | tf_model_path_mcr = ( 80 | "/Users/vsatpathy/Desktop/docs/training_data/mcr/document_classifier.h5" 81 | ) 82 | text_file_path_mcr = ( 83 | "/Users/vsatpathy/Desktop/docs/training_data/mcr/file_and_text_mcr.txt" 84 | ) 85 | labels_path_mcr = "/Users/vsatpathy/Desktop/docs/training_data/mcr/rev_labels_mcr.json" 86 | 87 | tf_model_path_som = ( 88 | "/Users/vsatpathy/Desktop/docs/training_data/som/document_classifier.h5" 89 | ) 90 | text_file_path_som = ( 91 | "/Users/vsatpathy/Desktop/docs/training_data/som/file_and_text_som.txt" 92 | ) 93 | master_labels_path = ( 94 | "/Users/vsatpathy/Desktop/docs/training_data/som/rev_labels_master_som.json" 95 | ) 96 | sub_labels_path = "/Users/vsatpathy/Desktop/docs/training_data/som/rev_labels_som.json" 97 | main() 98 | -------------------------------------------------------------------------------- /multiple_models/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.types import FileLike 3 | from bentoml.adapters import JsonInput, FileInput, MultiFileInput 4 | from bentoml.frameworks.spacy import SpacyModelArtifact 5 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 6 | from bentoml.service.artifacts.common import ( 7 | JSONArtifact, 8 | PickleArtifact, 9 | ) 10 | 11 | import tensorflow as tf 
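# Note: this combined service packs both artifact sets produced by bento_package.py above,
# the single-level "mcr" document classifier and the two-level "som" classifier, and exposes
# one FileInput endpoint per model (predict_document_labels_mcr and predict_document_labels_som).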
12 | from tensorflow.keras.preprocessing.image import load_img 13 | import numpy as np 14 | from pytesseract import image_to_string 15 | import re 16 | from PIL import Image 17 | from typing import List 18 | 19 | 20 | @bentoml.env(infer_pip_packages=True) 21 | @bentoml.artifacts( 22 | [ 23 | TensorflowSavedModelArtifact("mcr_model"), 24 | SpacyModelArtifact("mcr_spacy_model"), 25 | PickleArtifact("mcr_tokenizer"), 26 | JSONArtifact("mcr_labels"), 27 | TensorflowSavedModelArtifact("som_model"), 28 | SpacyModelArtifact("som_spacy_model"), 29 | PickleArtifact("som_tokenizer"), 30 | JSONArtifact("som_master_labels"), 31 | JSONArtifact("som_sub_labels"), 32 | ] 33 | ) 34 | class ModelZoo(bentoml.BentoService): 35 | def helper(self, text): 36 | dummy = [] 37 | for word in text: 38 | dummy.append(str(word)) 39 | final = " ".join(dummy) 40 | return final 41 | 42 | def preprocess_spacy(self, spacy_model, text, num_of_words: int): 43 | text = str(text) 44 | text = text.split(" ") 45 | text = self.helper(text) 46 | text = str(text.lower()) 47 | # Remove all the special characters 48 | text = re.sub(r"\W", " ", text) 49 | text = re.sub(r"[^a-zA-Z ]+", "", text) 50 | # remove all single characters 51 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 52 | # Remove single characters from the start 53 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 54 | # Substituting multiple spaces with single space 55 | text = re.sub(r"\s+", " ", text, flags=re.I) 56 | # text = self.artifacts.mcr_spacy_model(text) 57 | text = spacy_model(text) 58 | filtered = [token.lemma_ for token in text if token.is_stop == False] 59 | text = " ".join(filtered[: num_of_words * 2]) 60 | text = text.strip().split(" ") 61 | text = " ".join(text[:num_of_words]) 62 | return text 63 | 64 | def tokenize_sentence(self, sentence, tokenizer, maximum_word_length): 65 | updated_sentence = sentence.split(" ") 66 | tok_sent = [] 67 | for word in updated_sentence: 68 | if word in tokenizer.word_index: 69 | tok_sent.append(tokenizer.word_index[word]) 70 | else: 71 | tok_sent.append(0) 72 | if len(tok_sent) != maximum_word_length: 73 | delta = maximum_word_length - len(tok_sent) 74 | for i in range(delta): 75 | tok_sent.append(0) 76 | return tok_sent 77 | 78 | def pre_process_image(self, image_file): 79 | ocr_image = np.asarray(Image.open(image_file)) 80 | image = np.asarray( 81 | Image.open(image_file).convert(mode="RGB").resize((100, 100)) 82 | ) 83 | image = np.divide(image, 255.0) 84 | image = np.asarray([image]).astype("float32") 85 | return ocr_image, image 86 | 87 | def pre_process_mcr(self, file): 88 | ocr_image, image = self.pre_process_image(image_file=file) 89 | doc_text = image_to_string(ocr_image) 90 | doc_text_processed = self.preprocess_spacy( 91 | spacy_model=self.artifacts.mcr_spacy_model, text=doc_text, num_of_words=10 92 | ) 93 | fin_text = self.tokenize_sentence( 94 | sentence=doc_text_processed, 95 | tokenizer=self.artifacts.mcr_tokenizer, 96 | maximum_word_length=10, 97 | ) 98 | return image, np.asarray([fin_text]).astype("float32") 99 | 100 | def pre_process_som(self, file): 101 | ocr_image, image = self.pre_process_image(image_file=file) 102 | doc_text = image_to_string(ocr_image) 103 | doc_text_processed = self.preprocess_spacy( 104 | spacy_model=self.artifacts.som_spacy_model, text=doc_text, num_of_words=10 105 | ) 106 | fin_text = self.tokenize_sentence( 107 | sentence=doc_text_processed, 108 | tokenizer=self.artifacts.som_tokenizer, 109 | maximum_word_length=10, 110 | ) 111 | return image, 
np.asarray([fin_text]).astype("float32") 112 | 113 | @bentoml.api(input=FileInput()) 114 | def predict_document_labels_mcr(self, file_stream): 115 | image, text = self.pre_process_mcr(file=file_stream) 116 | model = self.artifacts.mcr_model.signatures["serving_default"] 117 | model._num_positional_args = 2 118 | results = model(tf.constant(text), tf.constant(image)) 119 | conv_results = results.get("dense_1")[0].numpy() 120 | document_label = self.artifacts.mcr_labels[str(np.argmax(conv_results))] 121 | return {"document_type": document_label} 122 | 123 | @bentoml.api(input=FileInput()) 124 | def predict_document_labels_som(self, file_stream): 125 | image, text = self.pre_process_som(file=file_stream) 126 | model = self.artifacts.som_model.signatures["serving_default"] 127 | model._num_positional_args = 2 128 | results = model(tf.constant(text), tf.constant(image)) 129 | mas_results = results.get("dense_1")[0].numpy() 130 | sub_results = results.get("dense_4")[0].numpy() 131 | master_label = self.artifacts.som_master_labels[str(np.argmax(mas_results))] 132 | sub_label = self.artifacts.som_sub_labels[str(np.argmax(sub_results))] 133 | return {"master document type": master_label, "sub document type": sub_label} 134 | -------------------------------------------------------------------------------- /multiple_models/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | web: 4 | image: 142339138776.dkr.ecr.us-east-2.amazonaws.com/docuedge-model-zoo:latest 5 | ports: 6 | - "5000:5000" 7 | logging: 8 | driver: awslogs 9 | options: 10 | awslogs-group: docuedge-modelserver-ecs 11 | awslogs-region: us-east-2 12 | awslogs-stream-prefix: web 13 | volumes: 14 | - /app/temp 15 | -------------------------------------------------------------------------------- /multiple_models/ecs-params.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | task_definition: 3 | task_execution_role: ecsTaskExecutionRoleBento 4 | ecs_network_mode: awsvpc 5 | task_size: 6 | mem_limit: 8GB 7 | cpu_limit: 4096 8 | efs_volumes: 9 | - name: docuedgedev-efs 10 | filesystem_id: fs-4717c93f 11 | root_directory: /smartbox-config 12 | run_params: 13 | network_configuration: 14 | awsvpc_configuration: 15 | subnets: 16 | - subnet-00e7bff093931a167 17 | - subnet-0345b051535c9625d 18 | security_groups: 19 | - sg-09b7e06cb8b13167d 20 | - sg-0601a52d4ea28af05 21 | assign_public_ip: ENABLED 22 | -------------------------------------------------------------------------------- /multiple_models/hybrid_v1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from tensorflow.keras.preprocessing.image import load_img 4 | from tensorflow.keras.preprocessing.text import Tokenizer 5 | from tensorflow.keras.layers import ( 6 | Input, 7 | Conv2D, 8 | Dense, 9 | Flatten, 10 | Embedding, 11 | Concatenate, 12 | GlobalMaxPool1D, 13 | ) 14 | from tensorflow.keras.models import Model, load_model 15 | import os 16 | import json 17 | import mlflow 18 | import mlflow.tensorflow 19 | 20 | tracking_uri = "http://testuser:test@ec2-18-220-228-243.us-east-2.compute.amazonaws.com" 21 | mlflow.set_tracking_uri(tracking_uri) 22 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 23 | try: 24 | expr_name = "hybrid_v1" # create a new experiment (do not replace) 25 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 26 | 
mlflow.create_experiment(expr_name, s3_bucket) 27 | mlflow.set_experiment(expr_name) 28 | except Exception: 29 | pass 30 | experiment = mlflow.get_experiment_by_name("hybrid_v1")  # bound on both paths, so start_run below cannot hit a NameError 31 | 32 | def read_data(path): 33 | bow = open(path, "r") 34 | data = bow.readlines() 35 | all_data_paths = [] 36 | all_texts = [] 37 | y_labels = {} 38 | for line in data: 39 | line_data = line.split("####") 40 | all_data_paths.append(line_data[0]) 41 | all_texts.append(line_data[-1][:-1]) 42 | label = line_data[0].split("/")[-2] 43 | if label not in y_labels: 44 | y_labels[label] = len(y_labels) 45 | 46 | rev_labels = {} 47 | for key, val in y_labels.items(): 48 | rev_labels[val] = key 49 | 50 | return all_data_paths, y_labels, rev_labels, all_texts 51 | 52 | 53 | def tokenize_sentence(sentence, tokenizer, maximum_word_length): 54 | updated_sentence = sentence.split(" ") 55 | tok_sent = [] 56 | for word in updated_sentence: 57 | if word in tokenizer.word_index: 58 | tok_sent.append(tokenizer.word_index[word]) 59 | else: 60 | tok_sent.append(0) 61 | if len(tok_sent) != maximum_word_length: 62 | delta = maximum_word_length - len(tok_sent) 63 | for i in range(delta): 64 | tok_sent.append(0) 65 | return tok_sent 66 | 67 | 68 | def data_loader_text( 69 | bs, data, y_lab, tokenizer, text_data, image_input_shape, max_word_length 70 | ): 71 | while True: 72 | images = [] 73 | labels = [] 74 | texts = [] 75 | while len(images) < bs: 76 | indice = random.randint(0, len(data) - 1) 77 | target = data[indice].split("/")[-2] 78 | labels.append(y_lab[target]) 79 | 80 | test_img = np.asarray(load_img(data[indice], target_size=image_input_shape)) 81 | img = np.divide(test_img, 255.0) 82 | images.append(img) 83 | 84 | tok_sen = tokenize_sentence( 85 | text_data[indice], tokenizer, maximum_word_length=max_word_length 86 | ) 87 | texts.append(tok_sen) 88 | yield [np.asarray(images), np.asarray(texts)], np.asarray(labels) 89 | 90 | 91 | def model_arc(y_labels, tokenizer, text_model_inp_shape, image_inp_shape): 92 | inp_layer_texts = Input(shape=text_model_inp_shape) 93 | inp_layer_images = Input(shape=image_inp_shape) 94 | 95 | embedding_layer = Embedding( 96 | input_dim=len(tokenizer.word_index) + 1, 97 | output_dim=64, 98 | input_length=text_model_inp_shape, 99 | trainable=True, 100 | )(inp_layer_texts) 101 | pooling_layer = GlobalMaxPool1D()(embedding_layer) 102 | dense_layer = Dense(units=64, activation="relu")(pooling_layer) 103 | # lstm_layer = Bidirectional(LSTM(units=32))(embedding_layer) 104 | 105 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 106 | inp_layer_images 107 | ) 108 | flatten_layer = Flatten()(conv_layer) 109 | 110 | concat_layer = Concatenate()([flatten_layer, dense_layer]) 111 | out_layer = Dense(len(y_labels), activation="softmax")(concat_layer) 112 | 113 | model = Model([inp_layer_images, inp_layer_texts], out_layer) 114 | model.compile( 115 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 116 | ) 117 | return model 118 | 119 | 120 | def train_hybrid_v1( 121 | text_plus_file_path: str, 122 | batch_size: int, 123 | epochs: int, 124 | image_shape: int, 125 | max_words: int, 126 | artifact_name: str, 127 | save_dir_path: str, 128 | trained_model_path: str, 129 | ): 130 | all_imgs_path, y_labels, rev_labels, all_text = read_data(path=text_plus_file_path) 131 | num_train_img = len(all_imgs_path) 132 | 133 | with open( 134 | os.path.join(save_dir_path, artifact_name, f"rev_labels_{artifact_name}.json"), 135 | "w+", 136 | ) as tar: 137 | json.dump(rev_labels, tar) 138 | 139 | print("target_encodings: ", y_labels) 140 | print("Number of training images: ", num_train_img) 141 | 142 | bow = open(text_plus_file_path, "r") 143 | tokenizer = Tokenizer() 144 | tokenizer.fit_on_texts(bow.read().split("####")) 145 | 146 | train_gen = data_loader_text( 147 | tokenizer=tokenizer, 148 | y_lab=y_labels, 149 | data=all_imgs_path, 150 | text_data=all_text, 151 | bs=batch_size, 152 | image_input_shape=(image_shape, image_shape, 3), 153 | max_word_length=max_words, 154 | ) 155 | if os.path.isfile(trained_model_path): 156 | model = load_model(trained_model_path) 157 | else: 158 | model = model_arc( 159 | y_labels=y_labels, 160 | tokenizer=tokenizer, 161 | text_model_inp_shape=(max_words,), 162 | image_inp_shape=(image_shape, image_shape, 3), 163 | ) 164 | mlflow.tensorflow.autolog(every_n_iter=1) 165 | with mlflow.start_run(experiment_id=experiment.experiment_id): 166 | mlflow.log_metrics( 167 | { 168 | "batch_size": batch_size, 169 | "epochs": epochs, 170 | "image_shape": image_shape, 171 | "max_words": max_words, 172 | } 173 | ) 174 | history = model.fit( 175 | x=train_gen, 176 | steps_per_epoch=num_train_img // batch_size, 177 | epochs=epochs, 178 | ) 179 | model.save( 180 | filepath=os.path.join( 181 | save_dir_path, artifact_name, "document_classifier.h5" 182 | ) 183 | ) 184 | meta_data_path = os.path.join(save_dir_path, artifact_name) 185 | for artifact in sorted(os.listdir(meta_data_path)): 186 | if artifact != ".DS_Store": 187 | artifact_path = os.path.join(meta_data_path, artifact) 188 | if ( 189 | os.path.isfile(artifact_path) 190 | and artifact_path.split(".")[-1] != "h5" 191 | ): 192 | print(f"artifact to be uploaded is: {artifact}") 193 | mlflow.log_artifact(local_path=artifact_path) 194 | 195 | artifact_uri = mlflow.get_artifact_uri() 196 | print(artifact_uri) 197 | mlflow.end_run() 198 |
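hybrid_v2.py below extends hybrid_v1 with hierarchical labels: the parent folder supplies a master document type and the leaf folder a sub type, and model_arc returns a model with two softmax heads trained jointly. A minimal sketch of decoding both heads at inference time (a hedged illustration; the rev_labels names come from read_data below, the rest is assumed):

master_probs, sub_probs = model.predict([images, texts])  # one array per output head
master_label = rev_labels_master_doc_type[int(np.argmax(master_probs[0]))]
sub_label = rev_labels_doc_type[int(np.argmax(sub_probs[0]))]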
-------------------------------------------------------------------------------- /multiple_models/hybrid_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | from tensorflow.keras.preprocessing.image import load_img 5 | from tensorflow.keras.preprocessing.text import Tokenizer 6 | from tensorflow.keras.layers import ( 7 | Input, 8 | Conv2D, 9 | Dense, 10 | Flatten, 11 | Embedding, 12 | Concatenate, 13 | GlobalMaxPool1D, 14 | Conv1D, 15 | MaxPooling1D, 16 | ) 17 | from tensorflow.keras.models import Model, load_model 18 | import os 19 | import json 20 | import mlflow 21 | import mlflow.tensorflow 22 | 23 | tracking_uri = "http://testuser:test@ec2-18-220-228-243.us-east-2.compute.amazonaws.com" 24 | mlflow.set_tracking_uri(tracking_uri) 25 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 26 | try: 27 | expr_name = "hybrid_v2" # create a new experiment (do not replace) 28 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 29 | mlflow.create_experiment(expr_name, s3_bucket) 30 | mlflow.set_experiment(expr_name) 31 | except Exception: 32 | pass 33 | experiment = mlflow.get_experiment_by_name("hybrid_v2")  # bound on both paths, so start_run below cannot hit a NameError 34 | 35 | def read_data(path): 36 | bow = open(path, "r") 37 | data = bow.readlines() 38 | all_data_paths = [] 39 | all_texts = [] 40 | doc_type_y_labels = {} 41 | master_doc_type_y_labels = {} 42 | for line in data: 43 | line_data = line.split("####") 44 | all_data_paths.append(line_data[0]) 45 | all_texts.append(line_data[-1][:-1]) 46 | doc_type_label = line_data[0].split("/")[-2] 47 | master_doc_type_label = line_data[0].split("/")[-3] 48 | if
doc_type_label not in doc_type_y_labels: 49 | doc_type_y_labels[doc_type_label] = len(doc_type_y_labels) 50 | if master_doc_type_label not in master_doc_type_y_labels: 51 | master_doc_type_y_labels[master_doc_type_label] = len( 52 | master_doc_type_y_labels 53 | ) 54 | 55 | rev_labels_doc_type = {} 56 | for key, val in doc_type_y_labels.items(): 57 | rev_labels_doc_type[val] = key 58 | rev_labels_master_doc_type = {} 59 | for key, val in master_doc_type_y_labels.items(): 60 | rev_labels_master_doc_type[val] = key 61 | 62 | return ( 63 | all_data_paths, 64 | doc_type_y_labels, 65 | rev_labels_doc_type, 66 | all_texts, 67 | master_doc_type_y_labels, 68 | rev_labels_master_doc_type, 69 | ) 70 | 71 | 72 | def tokenize_sentence(sentence, tokenizer, maximum_word_length): 73 | updated_sentence = sentence.split(" ") 74 | tok_sent = [] 75 | for word in updated_sentence: 76 | if word in tokenizer.word_index: 77 | tok_sent.append(tokenizer.word_index[word]) 78 | else: 79 | tok_sent.append(0) 80 | if len(tok_sent) != maximum_word_length: 81 | delta = maximum_word_length - len(tok_sent) 82 | for i in range(delta): 83 | tok_sent.append(0) 84 | return tok_sent 85 | 86 | 87 | def data_loader_text( 88 | bs, 89 | data, 90 | y_lab, 91 | tokenizer, 92 | text_data, 93 | image_input_shape, 94 | max_word_length, 95 | y_sub_labels, 96 | ): 97 | while True: 98 | images = [] 99 | master_labels = [] 100 | sub_labels = [] 101 | texts = [] 102 | while len(images) < bs: 103 | indice = random.randint(0, len(data) - 1) 104 | target = data[indice].split("/")[-3] 105 | sub_target = data[indice].split("/")[-2] 106 | master_labels.append(y_lab[target]) 107 | sub_labels.append(y_sub_labels[sub_target]) 108 | 109 | test_img = np.asarray(load_img(data[indice], target_size=image_input_shape)) 110 | img = np.divide(test_img, 255.0) 111 | images.append(img) 112 | 113 | tok_sen = tokenize_sentence( 114 | text_data[indice], tokenizer, maximum_word_length=max_word_length 115 | ) 116 | texts.append(tok_sen) 117 | yield [np.asarray(images), np.asarray(texts)], [ 118 | np.asarray(master_labels), 119 | np.asarray(sub_labels), 120 | ] 121 | 122 | 123 | def model_arc(y_labels, tokenizer, text_model_inp_shape, image_inp_shape, y_sub_labels): 124 | inp_layer_texts = Input(shape=text_model_inp_shape) 125 | inp_layer_images = Input(shape=image_inp_shape) 126 | 127 | embedding_layer = Embedding( 128 | input_dim=len(tokenizer.word_index) + 1, 129 | output_dim=64, 130 | input_length=text_model_inp_shape, 131 | trainable=True, 132 | )(inp_layer_texts) 133 | pooling_layer = GlobalMaxPool1D()(embedding_layer) 134 | dense_layer = Dense(units=64, activation="relu")(pooling_layer) 135 | 136 | conv_layer = Conv2D(filters=64, kernel_size=(2, 2), activation="relu")( 137 | inp_layer_images 138 | ) 139 | flatten_layer = Flatten()(conv_layer) 140 | 141 | concat_layer = Concatenate()([flatten_layer, dense_layer]) 142 | out_layer = Dense(len(y_labels), activation="softmax")(concat_layer) 143 | 144 | sub_model_inp = Dense(units=64, activation="relu")(out_layer) 145 | sub_dense_layer = Dense(units=256, activation="relu")(sub_model_inp) 146 | # sub_expansion_layer = tf.expand_dims(sub_model_inp, axis=-1) 147 | # sub_conv_layer = Conv1D(filters=64, kernel_size=(2,), activation="relu")( 148 | # sub_expansion_layer 149 | # ) 150 | # sub_pool_layer = MaxPooling1D(pool_size=2)(sub_conv_layer) 151 | # sub_flatten_layer = Flatten()(sub_pool_layer) 152 | 153 | sub_concat_layer = Concatenate()([sub_dense_layer, concat_layer]) 154 | sub_out_layer = 
Dense(units=len(y_sub_labels), activation="softmax")( 155 | sub_concat_layer 156 | ) 157 | 158 | model = Model([inp_layer_images, inp_layer_texts], [out_layer, sub_out_layer]) 159 | model.compile( 160 | optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] 161 | ) 162 | return model 163 | 164 | 165 | def train_hybrid_v2( 166 | text_plus_file_path: str, 167 | batch_size: int, 168 | epochs: int, 169 | image_shape: int, 170 | max_words: int, 171 | artifact_name: str, 172 | save_dir_path: str, 173 | trained_model_path: str, 174 | ): 175 | ( 176 | all_imgs_path, 177 | doc_type_y_labels, 178 | rev_labels_doc_type, 179 | all_text, 180 | master_doc_type_label, 181 | rev_labels_master_doc_type, 182 | ) = read_data(path=text_plus_file_path) 183 | num_train_img = len(all_imgs_path) 184 | 185 | with open( 186 | os.path.join(save_dir_path, artifact_name, f"rev_labels_{artifact_name}.json"), 187 | "w+", 188 | ) as tar: 189 | json.dump(rev_labels_doc_type, tar) 190 | with open( 191 | os.path.join( 192 | save_dir_path, artifact_name, f"rev_labels_master_{artifact_name}.json" 193 | ), 194 | "w+", 195 | ) as tar: 196 | json.dump(rev_labels_master_doc_type, tar) 197 | 198 | print("target_encodings: ", master_doc_type_label) 199 | print("target_encodings: ", doc_type_y_labels) 200 | print("Number of training images: ", num_train_img) 201 | 202 | bow = open(text_plus_file_path, "r") 203 | tokenizer = Tokenizer() 204 | tokenizer.fit_on_texts(bow.read().split("####")) 205 | 206 | train_gen = data_loader_text( 207 | tokenizer=tokenizer, 208 | y_lab=master_doc_type_label, 209 | data=all_imgs_path, 210 | text_data=all_text, 211 | bs=batch_size, 212 | image_input_shape=(image_shape, image_shape, 3), 213 | max_word_length=max_words, 214 | y_sub_labels=doc_type_y_labels, 215 | ) 216 | if os.path.isfile(trained_model_path): 217 | model = load_model(trained_model_path) 218 | else: 219 | model = model_arc( 220 | y_labels=master_doc_type_label, 221 | tokenizer=tokenizer, 222 | text_model_inp_shape=(max_words,), 223 | image_inp_shape=(image_shape, image_shape, 3), 224 | y_sub_labels=doc_type_y_labels, 225 | ) 226 | mlflow.tensorflow.autolog(every_n_iter=1) 227 | with mlflow.start_run(experiment_id=experiment.experiment_id): 228 | mlflow.log_metrics( 229 | { 230 | "batch_size": batch_size, 231 | "epochs": epochs, 232 | "image_shape": image_shape, 233 | "max_words": max_words, 234 | } 235 | ) 236 | model.fit( 237 | x=train_gen, steps_per_epoch=num_train_img // batch_size, epochs=epochs 238 | ) 239 | model.save( 240 | filepath=os.path.join( 241 | save_dir_path, artifact_name, "document_classifier.h5" 242 | ) 243 | ) 244 | meta_data_path = os.path.join(save_dir_path, artifact_name) 245 | for artifact in sorted(os.listdir(meta_data_path)): 246 | if artifact != ".DS_Store": 247 | artifact_path = os.path.join(meta_data_path, artifact) 248 | if ( 249 | os.path.isfile(artifact_path) 250 | and artifact_path.split(".")[-1] != "h5" 251 | ): 252 | print(f"artifact to be uploaded is: {artifact}") 253 | mlflow.log_artifact(local_path=artifact_path) 254 | 255 | artifact_uri = mlflow.get_artifact_uri() 256 | print(artifact_uri) 257 | mlflow.end_run() 258 | -------------------------------------------------------------------------------- /multiple_models/pre_process_text.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import re 3 | from pdf2image import convert_from_path 4 | import os 5 | from tqdm import tqdm 6 | import pre_processing 7 | from PIL import 
Image 8 | import pytesseract 9 | 10 | nlp = spacy.load("en_core_web_sm") 11 | 12 | 13 | def helper(text): 14 | dummy = [] 15 | for word in text: 16 | dummy.append(str(word)) 17 | final = " ".join(dummy) 18 | return final 19 | 20 | 21 | def preprocess_spacy(text, num_of_words: int): 22 | text = str(text) 23 | text = text.split(" ") 24 | text = helper(text) 25 | text = str(text.lower()) 26 | # Remove all the special characters 27 | text = re.sub(r"\W", " ", text) 28 | text = re.sub(r"[^a-zA-Z ]+", "", text) 29 | # remove all single characters 30 | text = re.sub(r"\s+[a-zA-Z]\s+", " ", text) 31 | # Remove single characters from the start 32 | text = re.sub(r"\^[a-zA-Z]\s+", " ", text) 33 | # Substituting multiple spaces with single space 34 | text = re.sub(r"\s+", " ", text, flags=re.I) 35 | text = nlp(text) 36 | filtered = [token.lemma_ for token in text if token.is_stop == False] 37 | text = " ".join(filtered[: num_of_words * 2]) 38 | text = text.strip().split(" ") 39 | text = " ".join(text[:num_of_words]) 40 | return text 41 | 42 | 43 | def read_text_from_pages( 44 | complete_folder_path: str, 45 | path_to_save_essential_data: str, 46 | meta_name: str, 47 | num_of_words: int, 48 | ): 49 | final_path_for_data = os.path.join( 50 | path_to_save_essential_data, meta_name, f"file_and_text_{meta_name}.txt" 51 | ) 52 | if os.path.isfile(final_path_for_data): 53 | data = open(final_path_for_data, "r").read() 54 | file_exists = True 55 | else: 56 | data = "null" 57 | file_exists = False 58 | print("#### Reading pages ####") 59 | doc_image_types = sorted(os.listdir(complete_folder_path)) 60 | text_of_all_pages = [] 61 | for doc_image_type in doc_image_types: 62 | if doc_image_type != ".DS_Store": 63 | print("DOCUMENT TYPE: ", doc_image_type) 64 | complete_doc_image_path = os.path.join(complete_folder_path, doc_image_type) 65 | pages = sorted(os.listdir(complete_doc_image_path)) 66 | for page in tqdm(pages): 67 | if page != ".DS_Store": 68 | page_path = os.path.join(complete_doc_image_path, page) 69 | if page_path not in data: 70 | document_page = Image.open(page_path) 71 | document_text = pytesseract.image_to_string(document_page) 72 | document_page.close() 73 | essential_file_path_and_text = ( 74 | page_path 75 | + "####" 76 | + preprocess_spacy(document_text, num_of_words=num_of_words) 77 | + "\n" 78 | ) 79 | text_of_all_pages.append(essential_file_path_and_text) 80 | 81 | if os.path.isfile(final_path_for_data): 82 | all_essential_data = open(final_path_for_data, "a+") 83 | all_essential_data.writelines(text_of_all_pages) 84 | else: 85 | all_essential_data = open(final_path_for_data, "w") 86 | all_essential_data.writelines(text_of_all_pages) 87 | return final_path_for_data 88 | 89 | 90 | def read_text_from_pages_v2( 91 | complete_folder_path: str, 92 | path_to_save_essential_data: str, 93 | meta_name: str, 94 | num_of_words: int, 95 | ): 96 | final_path_for_data = os.path.join( 97 | path_to_save_essential_data, meta_name, f"file_and_text_{meta_name}.txt" 98 | ) 99 | if os.path.isfile(final_path_for_data): 100 | data = open(final_path_for_data, "r").read() 101 | file_exists = True 102 | else: 103 | data = "null" 104 | file_exists = False 105 | print("#### Reading pages ####") 106 | document_folders_path = os.path.join(complete_folder_path) 107 | master_doc_types = sorted(os.listdir(document_folders_path)) 108 | text_of_all_pages = [] 109 | for master_doc_type in master_doc_types: 110 | if master_doc_type != ".DS_Store": 111 | print("MASTER DOCUMENT TYPE: ", master_doc_type) 112 | sub_doc_type_path 
= os.path.join(document_folders_path, master_doc_type) 113 | for doc_image_type in sorted(os.listdir(sub_doc_type_path)): 114 | if doc_image_type != ".DS_Store": 115 | print("DOCUMENT TYPE: ", doc_image_type) 116 | complete_doc_image_path = os.path.join( 117 | sub_doc_type_path, doc_image_type 118 | ) 119 | pages = sorted(os.listdir(complete_doc_image_path)) 120 | for page in tqdm(pages): 121 | if page != ".DS_Store": 122 | page_path = os.path.join(complete_doc_image_path, page) 123 | if page_path not in data: 124 | document_page = Image.open(page_path) 125 | document_text = pytesseract.image_to_string( 126 | document_page 127 | ) 128 | document_page.close() 129 | essential_file_path_and_text = ( 130 | page_path 131 | + "####" 132 | + preprocess_spacy( 133 | document_text, num_of_words=num_of_words 134 | ) 135 | + "\n" 136 | ) 137 | text_of_all_pages.append(essential_file_path_and_text) 138 | 139 | if os.path.isfile(final_path_for_data): 140 | all_essential_data = open(final_path_for_data, "a+") 141 | all_essential_data.writelines(text_of_all_pages) 142 | else: 143 | all_essential_data = open(final_path_for_data, "w") 144 | all_essential_data.writelines(text_of_all_pages) 145 | return final_path_for_data 146 | 147 | 148 | def pdf_to_images(full_path_pdf: str, converted_images_path: str, meta_name: str): 149 | doc = full_path_pdf.split("/")[-1] 150 | index = 0 151 | OUTPUT_PATH = converted_images_path 152 | os.makedirs(name=OUTPUT_PATH, exist_ok=True) 153 | 154 | print("Document name: ", doc) 155 | if str(doc.split(".pdf")[-2]) + "_" + str(index) + ".jpg" not in os.listdir( 156 | converted_images_path 157 | ): 158 | pil_images = convert_from_path(full_path_pdf, dpi=300) 159 | 160 | for image in tqdm(pil_images): 161 | processed_image = pre_processing.preprocess_image_file(image) 162 | try: 163 | processed_image = Image.fromarray(processed_image) 164 | processed_image.save( 165 | os.path.join(OUTPUT_PATH, str(doc.split(".pdf")[-2])) 166 | + "_" 167 | + str(index) 168 | + ".jpg", 169 | format="JPEG", 170 | subsampling=0, 171 | quality=100, 172 | ) 173 | index += 1 174 | processed_image.close() 175 | except: 176 | index += 1 177 | else: 178 | pass -------------------------------------------------------------------------------- /multiple_models/pre_processing.py: -------------------------------------------------------------------------------- 1 | """IMAGE PREPROCESSING FUNCTIONS 2 | """ 3 | import cv2 4 | import numpy as np 5 | from scipy.ndimage.filters import rank_filter 6 | 7 | # from sbox.utils.sbox_logger import logger 8 | import pytesseract 9 | import re 10 | import imutils 11 | from PIL import Image 12 | 13 | # print("error") # = logger(__name__) 14 | 15 | 16 | class PagePreprocess(object): 17 | def __init__(self, im): 18 | self.err = False 19 | self.orig_im = im 20 | self.orig_shape = self.orig_im.shape 21 | self.image = im 22 | 23 | def crop(self): 24 | try: 25 | self.image, self.num_tries = process_image(self.orig_im) 26 | self.crop_shape = self.image.shape 27 | return self.image 28 | except Exception as e: 29 | print("crop_obj_Error") # (f"Error: {e}", exc_info=True) 30 | 31 | def deskew(self): 32 | try: 33 | self.image, self.theta_est = process_skewed_crop(self.image) 34 | return self.image 35 | except Exception as e: 36 | print("deskew_obj_Error") # (f"Error: {e}", exc_info=True) 37 | 38 | 39 | def auto_canny(image, sigma=0.33): 40 | try: 41 | v = np.median(image) 42 | lower = int(max(0, (1.0 - sigma) * v)) 43 | upper = int(min(255, (1.0 + sigma) * v)) 44 | edged = cv2.Canny(image, 
lower, upper, True) 45 | return edged 46 | except Exception as e: 47 | print("auto_canny_Error") # (f"Error: {e}", exc_info=True) 48 | 49 | 50 | def dilate(image, kernel, iterations): 51 | dilated_image = cv2.dilate(image, kernel, iterations=iterations) 52 | return dilated_image 53 | 54 | 55 | def downscale_image(im, max_dim=2048): 56 | try: 57 | a, b = im.shape[:2] 58 | if max(a, b) <= max_dim: 59 | return 1.0, im 60 | 61 | scale = 1.0 * max_dim / max(a, b) 62 | new_im = cv2.resize(im, (int(b * scale), int(a * scale)), cv2.INTER_AREA) 63 | return scale, new_im 64 | except Exception as e: 65 | print("error") # (f"Error: {e}", exc_info=True) 66 | 67 | 68 | def find_components(im, max_components=16): 69 | try: 70 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) 71 | dilation = dilate(im, kernel, 6) 72 | 73 | count = 21 74 | n = 0 75 | sigma = 0.000 76 | 77 | while count > max_components: 78 | n += 1 79 | sigma += 0.005 80 | result = cv2.findContours(dilation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 81 | if len(result) == 3: 82 | _, contours, hierarchy = result 83 | elif len(result) == 2: 84 | contours, hierarchy = result 85 | possible = find_likely_rectangles(contours, sigma) 86 | count = len(possible) 87 | 88 | return (dilation, possible, n) 89 | except Exception as e: 90 | print("comp_error") # (f"Error: {e}", exc_info=True) 91 | 92 | 93 | def find_likely_rectangles(contours, sigma): 94 | try: 95 | contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] 96 | possible = [] 97 | for c in contours: 98 | 99 | peri = cv2.arcLength(c, True) 100 | approx = cv2.approxPolyDP(c, sigma * peri, True) 101 | box = make_box(approx) 102 | possible.append(box) 103 | 104 | return possible 105 | except Exception as e: 106 | print("likely_rec_error") # (f"Error: {e}", exc_info=True) 107 | 108 | 109 | def make_box(poly): 110 | try: 111 | x = [] 112 | y = [] 113 | for p in poly: 114 | for point in p: 115 | x.append(point[0]) 116 | y.append(point[1]) 117 | xmax = max(x) 118 | ymax = max(y) 119 | xmin = min(x) 120 | ymin = min(y) 121 | return (xmin, ymin, xmax, ymax) 122 | except Exception as e: 123 | print("bbox_error") # (f"Error: {e}", exc_info=True) 124 | 125 | 126 | def rect_union(crop1, crop2): 127 | x11, y11, x21, y21 = crop1 128 | x12, y12, x22, y22 = crop2 129 | return min(x11, x12), min(y11, y12), max(x21, x22), max(y21, y22) 130 | 131 | 132 | def rect_area(crop): 133 | x1, y1, x2, y2 = crop 134 | return max(0, x2 - x1) * max(0, y2 - y1) 135 | 136 | 137 | def crop_image(im, rect, scale): 138 | try: 139 | xmin, ymin, xmax, ymax = rect 140 | crop = [xmin, ymin, xmax, ymax] 141 | xmin, ymin, xmax, ymax = [int(x / scale) for x in crop] 142 | if ((ymax - ymin) * (xmax - xmin)) > 0.25 * im.size: 143 | cropped = im[ymin:ymax, xmin:xmax] 144 | else: 145 | cropped = im 146 | return cropped 147 | except Exception as e: 148 | print("crop_error_1") # (f"Error: {e}", exc_info=True) 149 | 150 | 151 | def reduce_noise_raw(im): 152 | bilat = cv2.bilateralFilter(im, 4, 75, 75) 153 | blur = cv2.medianBlur(bilat, 1) 154 | return blur 155 | 156 | 157 | def reduce_noise_edges(im): 158 | try: 159 | structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) 160 | opening = cv2.morphologyEx(im, cv2.MORPH_OPEN, structuring_element) 161 | maxed_rows = rank_filter(opening, -4, size=(1, 20)) 162 | maxed_cols = rank_filter(opening, -4, size=(20, 1)) 163 | debordered = np.minimum(np.minimum(opening, maxed_rows), maxed_cols) 164 | return debordered 165 | except Exception as e: 166 | 
print("noise_red_Error")  # (f"Error: {e}", exc_info=True)
167 | 
168 | 
169 | def rects_are_vertical(rect1, rect2, rect_align=2):
170 |     try:
171 |         xmin1, ymin1, xmax1, ymax1 = rect1
172 |         xmin2, ymin2, xmax2, ymax2 = rect2
173 | 
174 |         midpoint1 = (xmin1 + xmax1) / 2
175 |         midpoint2 = (xmin2 + xmax2) / 2
176 |         dist = abs(midpoint1 - midpoint2)
177 | 
178 |         rectarea1 = rect_area(rect1)
179 |         rectarea2 = rect_area(rect2)
180 |         if rectarea1 > rectarea2:
181 |             thres = (xmax1 - xmin1) * rect_align
182 |         else:
183 |             thres = (xmax2 - xmin2) * rect_align
184 | 
185 |         if thres > dist:
186 |             align = True
187 |         else:
188 |             align = False
189 |         return align
190 |     except Exception as e:
191 |         print("vert_rec_Error")  # (f"Error: {e}", exc_info=True)
192 | 
193 | 
194 | def find_final_crop(im, rects, orig_im):
195 |     try:
196 |         current = None
197 |         for rect in rects:
198 |             if current is None:
199 |                 current = rect
200 |                 continue
201 | 
202 |             aligned = rects_are_vertical(current, rect)
203 | 
204 |             if not aligned:
205 |                 continue
206 | 
207 |             current = rect_union(current, rect)
208 |         if current is not None:
209 |             return current
210 |         else:
211 |             return (0, 0, orig_im.shape[1], orig_im.shape[0])  # (xmin, ymin, xmax, ymax); shape is (rows, cols)
212 |     except Exception as e:
213 |         print("crop_Error")  # (f"Error: {e}", exc_info=True)
214 | 
215 | 
216 | def process_image(orig_im):
217 |     try:
218 |         scale, im = downscale_image(orig_im)
219 | 
220 |         blur = reduce_noise_raw(im.copy())
221 | 
222 |         edges = auto_canny(blur.copy())
223 | 
224 |         debordered = reduce_noise_edges(edges.copy())
225 | 
226 |         dilation, rects, num_tries = find_components(debordered, 16)
227 | 
228 |         final_rect = find_final_crop(dilation, rects, orig_im)
229 | 
230 |         cropped = crop_image(orig_im, final_rect, scale)
231 |         # kernel = np.ones((3, 3), np.float32) / 25
232 |         # smooth2d = cv2.filter2D(cropped, -1, kernel=kernel)
233 |         return (cropped, num_tries)
234 |     except Exception as e:
235 |         print("process")  # (f"Error: {e}", exc_info=True)
236 | 
237 | 
238 | def rad_to_deg(theta):
239 |     return theta * 180 / np.pi
240 | 
241 | 
242 | def rotate(image, theta):
243 |     try:
244 |         (h, w) = image.shape[:2]
245 |         center = (w / 2, h / 2)
246 |         M = cv2.getRotationMatrix2D(center, theta, 1)
247 |         rotated = cv2.warpAffine(
248 |             image,
249 |             M,
250 |             (int(w), int(h)),
251 |             flags=cv2.INTER_LINEAR,
252 |             borderMode=cv2.BORDER_CONSTANT,
253 |             borderValue=(255, 255, 255),
254 |         )
255 |         return rotated
256 |     except Exception as e:
257 |         print("rotation_error")  # (f"Error: {e}", exc_info=True)
258 | 
259 | 
260 | def angle_calculation(gray):
261 |     gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY)
262 |     gray = cv2.bitwise_not(gray)
263 |     thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
264 | 
265 |     coords = np.column_stack(np.where(thresh > 0))
266 |     # print(coords, coords.shape)
267 | 
268 |     min_y = coords[0][0]
269 |     max_y = coords[-1][0]
270 |     min_x = coords[0][1]  # x coordinate, not the row index
271 |     max_x = coords[-1][1]
272 | 
273 |     left_most = coords[0]
274 |     right_most = coords[0]
275 |     top_most = coords[0]
276 |     bottom_most = coords[0]
277 |     # print(coords[0], coords[-1])
278 |     for i in range(1, coords.shape[0]):
279 |         y, x = coords[i][0], coords[i][1]
280 |         if y <= min_y:
281 |             min_y = y
282 |             top_most = coords[i]
283 |         elif y >= max_y:
284 |             max_y = y
285 |             bottom_most = coords[i]
286 |         if x <= min_x:
287 |             min_x = x
288 |             left_most = coords[i]
289 |         elif x >= max_x:
290 |             max_x = x
291 |             right_most = coords[i]
292 |     # print(top_most, left_most, bottom_most, right_most)
293 | 
294 |     slopes = []
295 |     edge_coor = [top_most, left_most, 
bottom_most, right_most]
296 |     for i in range(0, len(edge_coor)):
297 |         if i == len(edge_coor) - 1:
298 |             if abs((edge_coor[0][1] - edge_coor[i][1])) >= 10:
299 |                 angle = (
300 |                     (
301 |                         (edge_coor[0][0] - edge_coor[i][0])
302 |                         / (edge_coor[0][1] - edge_coor[i][1])
303 |                     )
304 |                     * 180
305 |                 ) / np.pi
306 |                 slopes.append(angle)
307 |             else:
308 |                 slopes.append(0.0)
309 |         else:
310 |             if abs((edge_coor[i + 1][1] - edge_coor[i][1])) >= 10:
311 |                 angle = (
312 |                     (
313 |                         (edge_coor[i + 1][0] - edge_coor[i][0])
314 |                         / (edge_coor[i + 1][1] - edge_coor[i][1])
315 |                     )
316 |                     * 180
317 |                 ) / np.pi
318 |                 slopes.append(angle)
319 |             else:
320 |                 slopes.append(0.0)
321 |         # img = cv2.circle(thresh, (edge_coor[i][1], edge_coor[i][0]), 5, (255, 0, 0), 2)
322 | 
323 |     slopes = np.asarray(slopes)
324 |     if len(np.where(slopes == 0.0)[0]) >= 2:
325 |         # two or more flat edges: the page is already level, don't rotate
326 |         return None
327 |     else:
328 |         # average the opposing edge slopes before picking a rotation angle
329 |         neg_slope = (slopes[0] + slopes[2]) / 2
330 |         pos_slope = (slopes[1] + slopes[3]) / 2
331 |         # print(pos_slope, neg_slope)
332 |         new_pos_slope = pos_slope
333 |         new_neg_slope = neg_slope
334 |         if pos_slope > 90:
335 |             if pos_slope < 180:
336 |                 new_pos_slope = 180 - pos_slope
337 |             else:
338 |                 new_pos_slope = pos_slope - ((pos_slope // 180) * 180)
339 |             # print(new_pos_slope)
340 |         if neg_slope < -90:
341 |             new_neg_slope = 180 + neg_slope
342 |         # print(new_pos_slope, new_neg_slope)
343 |         if new_pos_slope <= new_neg_slope:
344 |             fin_angle = pos_slope
345 |         else:
346 |             fin_angle = neg_slope
347 | 
348 |         if fin_angle < -90:
349 |             rot_angle = 180 + fin_angle
350 |         elif fin_angle > 90:
351 |             rot_angle = -(180 - fin_angle)
352 |         elif -90 < fin_angle < 0:
353 |             rot_angle = fin_angle
354 |         else:  # covers 0 and the exact right-angle cases so rot_angle is always bound
355 |             rot_angle = fin_angle
356 |         return rot_angle
357 | 
358 | 
359 | def estimate_skew(image):
360 |     try:
361 |         osd = pytesseract.image_to_osd(image)
362 |         angle = float(re.search(r"(?<=Rotate: )\d+", osd).group(0))
363 |         if angle == 0:
364 |             # fin_image = rotate(image_gray, angle)
365 |             edges = auto_canny(image)
366 |             # print(edges.shape)
367 |             # print("edges found: ", edges)
368 |             lines = cv2.HoughLines(edges, 1, np.pi / 270, 400)
369 |             # print("lines found: ", lines)
370 |             if lines is not None:
371 |                 new = edges.copy()
372 |                 thetas = []
373 |                 for line in lines:
374 |                     for rho, theta in line:
375 |                         a = np.cos(theta)
376 |                         b = np.sin(theta)
377 |                         x0 = a * rho
378 |                         y0 = b * rho
379 |                         x1 = int(x0 + 1000 * (-b))
380 |                         y1 = int(y0 + 1000 * (a))
381 |                         x2 = int(x0 - 1000 * (-b))
382 |                         y2 = int(y0 - 1000 * (a))
383 |                         if theta > np.pi / 3 and theta < np.pi * 2 / 3:
384 |                             thetas.append(theta)
385 |                         new = cv2.line(new, (x1, y1), (x2, y2), (255, 255, 255), 1)
386 | 
387 |                 theta_mean = np.mean(thetas) if len(thetas) > 0 else 0.0
388 |                 theta = -(90 - rad_to_deg(theta_mean))
389 |             else:
390 |                 # theta = angle_calculation(image)
391 |                 theta = 0.0
392 |         else:
393 |             theta = angle
394 |         return theta
395 |     except Exception as e:
396 |         print("theta_error")  # (f"Error: {e}", exc_info=True)
397 | 
398 | 
399 | def process_skewed_crop(image):
400 |     try:
401 |         theta = estimate_skew(image)
402 |         # print(theta)
403 |         # ret, thresh = cv2.threshold(image, 0, 127, cv2.THRESH_OTSU)
404 |         # print(thresh)
405 |         if theta is not None and (theta % 90) != 0:
406 |             rotated = rotate(image, theta)
407 |         elif theta is not None and (theta % 90) == 0:
408 |             rotated = imutils.rotate_bound(image, theta) 
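        # imutils.rotate_bound, unlike the cv2.warpAffine-based rotate() above,
        # grows the output canvas during rotation so no page content is clipped
        # at the coarse 90-degree angles reported by Tesseract's OSD.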
409 | else: 410 | rotated = image 411 | # print(rotated) 412 | return rotated, theta 413 | except Exception as e: 414 | print("skew_Error") # (f"Error: {e}", exc_info=True) 415 | 416 | 417 | def preprocess_image(file_path: str): 418 | try: 419 | gray_page = cv2.imread(file_path, 0) 420 | process_page = PagePreprocess(gray_page) 421 | _ = process_page.crop() 422 | deskewed_page = process_page.deskew() 423 | # cv2.imwrite(file_path, deskewed_page) 424 | return deskewed_page 425 | except Exception as e: 426 | print("process_image_error") # (f"Error: {e}", exc_info=True) 427 | 428 | 429 | def preprocess_image_file(img): 430 | try: 431 | # converted_image = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR) 432 | gray_page = cv2.cvtColor(np.array(img), cv2.COLOR_BGR2GRAY) 433 | # gray_page = cv2.cvtColor(gray_page, cv2.COLOR_BGR2RGB) 434 | process_page = PagePreprocess(gray_page) 435 | _ = process_page.crop() 436 | deskewed_page = process_page.deskew() 437 | return deskewed_page 438 | except Exception as e: 439 | print("error") # (f"Error: {e}", exc_info=True) 440 | -------------------------------------------------------------------------------- /multiple_models/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pre_process_text import ( 4 | pdf_to_images, 5 | read_text_from_pages, 6 | read_text_from_pages_v2, 7 | ) 8 | from hybrid_v1 import train_hybrid_v1 9 | from hybrid_v2 import train_hybrid_v2 10 | 11 | 12 | def process( 13 | dataset_path: str, 14 | save_dir: str, 15 | pdf_check: bool, 16 | artifact_name: str, 17 | num_words_to_read: int, 18 | ): 19 | updated_dataset_path = os.path.join(save_dir, artifact_name, "dataset") 20 | os.makedirs(updated_dataset_path, exist_ok=True) 21 | for document_type in sorted(os.listdir(dataset_path)): 22 | if document_type != ".DS_Store": 23 | folder_path = os.path.join(dataset_path, document_type) 24 | updated_document_type_folder_path = os.path.join( 25 | updated_dataset_path, document_type 26 | ) 27 | os.makedirs(updated_document_type_folder_path, exist_ok=True) 28 | for documents in sorted(os.listdir(folder_path)): 29 | if documents != ".DS_Store": 30 | document_path = os.path.join(folder_path, documents) 31 | if pdf_check: 32 | # Perform conversion and store the images in a temp folder 33 | pdf_to_images( 34 | full_path_pdf=document_path, 35 | converted_images_path=updated_document_type_folder_path, 36 | meta_name=artifact_name, 37 | ) 38 | if pdf_check: 39 | images_data_path = os.path.join(save_dir, artifact_name) 40 | else: 41 | images_data_path = dataset_path 42 | master_data_path = read_text_from_pages( 43 | complete_folder_path=images_data_path, 44 | path_to_save_essential_data=save_dir, 45 | meta_name=artifact_name, 46 | num_of_words=num_words_to_read, 47 | ) 48 | return master_data_path 49 | 50 | 51 | def single_level(args): 52 | all_data_path = process( 53 | dataset_path=args.data_path, 54 | save_dir=args.file_path, 55 | pdf_check=bool(args.pdfs), 56 | artifact_name=args.art_name, 57 | num_words_to_read=int(args.num_of_words), 58 | ) 59 | train_hybrid_v1( 60 | text_plus_file_path=all_data_path, 61 | batch_size=int(args.batch_size), 62 | epochs=int(args.epochs), 63 | image_shape=int(args.img_shape), 64 | max_words=int(args.num_of_words), 65 | artifact_name=args.art_name, 66 | save_dir_path=args.file_path, 67 | trained_model_path=args.model_path, 68 | ) 69 | 70 | 71 | def process_multi_level( 72 | dataset_path: str, 73 | save_dir: str, 74 | pdf_check: bool, 75 | 
artifact_name: str,
76 |     num_words_to_read: int,
77 | ):
78 |     updated_dataset_path = os.path.join(save_dir, artifact_name, "dataset")
79 |     os.makedirs(updated_dataset_path, exist_ok=True)
80 |     for master_document_type in sorted(os.listdir(dataset_path)):
81 |         if master_document_type != ".DS_Store":
82 |             document_folder_path = os.path.join(dataset_path, master_document_type)
83 |             updated_master_document_type_folder_path = os.path.join(
84 |                 updated_dataset_path, master_document_type
85 |             )
86 |             os.makedirs(updated_master_document_type_folder_path, exist_ok=True)
87 |             for document_type in sorted(os.listdir(document_folder_path)):
88 |                 if document_type != ".DS_Store":
89 |                     folder_path = os.path.join(document_folder_path, document_type)
90 |                     updated_document_type_folder_path = os.path.join(
91 |                         updated_master_document_type_folder_path, document_type
92 |                     )
93 |                     os.makedirs(updated_document_type_folder_path, exist_ok=True)
94 |                     for documents in sorted(os.listdir(folder_path)):
95 |                         if documents != ".DS_Store":
96 |                             document_path = os.path.join(folder_path, documents)
97 |                             if pdf_check:
98 |                                 # Perform conversion and store the images in a temp folder
99 |                                 pdf_to_images(
100 |                                     full_path_pdf=document_path,
101 |                                     converted_images_path=updated_document_type_folder_path,
102 |                                     meta_name=artifact_name,
103 |                                 )
104 |     if pdf_check:
105 |         images_data_path = os.path.join(save_dir, artifact_name)
106 |     else:
107 |         images_data_path = dataset_path
108 |     master_data_path = read_text_from_pages_v2(
109 |         complete_folder_path=images_data_path,
110 |         path_to_save_essential_data=save_dir,
111 |         meta_name=artifact_name,
112 |         num_of_words=num_words_to_read,
113 |     )
114 |     return master_data_path
115 | 
116 | 
117 | def multi_level(args):
118 |     all_data_path = process_multi_level(
119 |         dataset_path=args.data_path,
120 |         save_dir=args.file_path,
121 |         pdf_check=bool(args.pdfs),
122 |         artifact_name=args.art_name,
123 |         num_words_to_read=int(args.num_of_words),
124 |     )
125 |     train_hybrid_v2(
126 |         text_plus_file_path=all_data_path,
127 |         batch_size=int(args.batch_size),
128 |         epochs=int(args.epochs),
129 |         image_shape=int(args.img_shape),
130 |         max_words=int(args.num_of_words),
131 |         artifact_name=args.art_name,
132 |         save_dir_path=args.file_path,
133 |         trained_model_path=args.model_path,
134 |     )
135 | 
136 | 
137 | parser = argparse.ArgumentParser()
138 | parser.add_argument("-dp", "--data_path", help="File path of the dataset")
139 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts")
140 | parser.add_argument("-a", "--art_name", help="Artifacts name")
141 | parser.add_argument("-p", "--pdfs", default=False, help="Set when the dataset contains PDFs")
142 | parser.add_argument("-n", "--num_of_words", default=10, help="Number of words to read")
143 | parser.add_argument("-b", "--batch_size", default=8, help="Batch size for training")
144 | parser.add_argument("-e", "--epochs", default=3, help="Number of epochs")
145 | parser.add_argument("-is", "--img_shape", default=100, help="One dimension of image")
146 | parser.add_argument("-mp", "--model_path", default="NULL", help="Path to trained model")
147 | args = parser.parse_args()
148 | single_level(args=args)
149 | # multi_level(args=args)
--------------------------------------------------------------------------------
/trial.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | image_path = "/Users/vsatpathy/Desktop/off_POCs/intel-image-classification/seg_train/buildings/0.jpg"
4 | 
5 | with open(image_path, 
"rb") as f: 6 | image_bytes = f.read() 7 | 8 | files = { 9 | "image": ("test_image", image_bytes), 10 | } 11 | url = "http://127.0.0.1:5000/predict_image" 12 | # url = "https://bentoml.smartbox-capture.com/predict_document_labels_som" 13 | 14 | response = requests.post(url, files=files) 15 | print(response.text) 16 | -------------------------------------------------------------------------------- /vanilla_GAN/__pycache__/bento_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/vanilla_GAN/__pycache__/bento_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /vanilla_GAN/artifacts/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/vanilla_GAN/artifacts/saved_model.pb -------------------------------------------------------------------------------- /vanilla_GAN/artifacts/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/vanilla_GAN/artifacts/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /vanilla_GAN/artifacts/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibhavsatpathy/psAI-clOps/7b447177b3e43c8d028360f574e6a20ba5090cc5/vanilla_GAN/artifacts/variables/variables.index -------------------------------------------------------------------------------- /vanilla_GAN/bento_package.py: -------------------------------------------------------------------------------- 1 | from bento_predictor import DigitGenerator 2 | from tensorflow.keras.models import load_model 3 | import tensorflow as tf 4 | 5 | 6 | def classifier_models(model_service, model_path: str): 7 | model_gen = load_model(model_path) 8 | tf.saved_model.save(model_gen, "artifacts/") 9 | model_gen = tf.saved_model.load("artifacts/") 10 | model_service.pack("model", model_gen) 11 | 12 | 13 | def main(): 14 | model_service = DigitGenerator() 15 | classifier_models(model_service=model_service, model_path=generator_model_path) 16 | saved_path = model_service.save() 17 | 18 | 19 | generator_model_path = ( 20 | "/Users/vsatpathy/Desktop/docs/training_data/van_gan/generator.h5" 21 | ) 22 | main() -------------------------------------------------------------------------------- /vanilla_GAN/bento_predictor.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.adapters import JsonInput 3 | from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact 4 | 5 | import tensorflow as tf 6 | import importlib.util 7 | import numpy as np 8 | from PIL import Image 9 | 10 | 11 | @bentoml.env(infer_pip_packages=True) 12 | @bentoml.artifacts([TensorflowSavedModelArtifact("model")]) 13 | class DigitGenerator(bentoml.BentoService): 14 | @bentoml.api(input=JsonInput()) 15 | def generate_image(self, file_stream): 16 | model = self.artifacts.model.signatures["serving_default"] 17 | model._num_positional_args = 1 18 | noise = np.random.normal(0, 1, (1, 100)) 19 | noise = tf.convert_to_tensor(noise, 
dtype=tf.float32) 20 | results = model(noise) 21 | generated_image = results.get("dense_3")[0].numpy().reshape(28, 28) 22 | return {"digit_generated": generated_image} 23 | -------------------------------------------------------------------------------- /vanilla_GAN/infer.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import load_model 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from PIL import Image 5 | 6 | 7 | def test(gen_model_path: str, i: int): 8 | gen = load_model(gen_model_path) 9 | noise = np.random.normal(0, 1, (1, 100)) 10 | image = np.squeeze(gen.predict(noise), axis=0) 11 | plt.imsave( 12 | "/Users/vsatpathy/Desktop/off_POCs/cycle_gan/epoch_%d" % i, 13 | image.reshape(28, 28), 14 | format="jpg", 15 | cmap="gray", 16 | ) 17 | 18 | 19 | generator_model_path = ( 20 | "/Users/vsatpathy/Desktop/docs/training_data/van_gan/generator.h5" 21 | ) 22 | test(gen_model_path=generator_model_path, i=0) 23 | -------------------------------------------------------------------------------- /vanilla_GAN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import mlflow 5 | import argparse 6 | 7 | from tensorflow.keras.models import Model, Sequential 8 | from tensorflow.keras.datasets import mnist 9 | from tensorflow.keras.optimizers import Adam 10 | from tensorflow.keras import backend as K 11 | from tensorflow.keras import initializers 12 | from tensorflow.keras.layers import ( 13 | Reshape, 14 | Dense, 15 | Dropout, 16 | Flatten, 17 | BatchNormalization, 18 | Convolution2D, 19 | UpSampling2D, 20 | Input, 21 | LeakyReLU, 22 | ) 23 | 24 | 25 | tracking_uri = ( 26 | "http://testuser:password@ec2-18-218-100-222.us-east-2.compute.amazonaws.com" 27 | ) 28 | # tracking_uri = "postgresql://postgres:postgres@localhost:5432/" 29 | s3_bucket = "s3://docuedge-mlflow-bucket" # replace this value 30 | 31 | 32 | def generator(): 33 | gen = Sequential() 34 | gen.add(Dense(256, input_dim=100)) 35 | gen.add(LeakyReLU(0.2)) 36 | gen.add(Dense(512)) 37 | gen.add(LeakyReLU(0.2)) 38 | gen.add(Dense(1024)) 39 | gen.add(LeakyReLU(0.2)) 40 | gen.add(Dense(784, activation="tanh")) 41 | gen.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 42 | return gen 43 | 44 | 45 | def discriminator(): 46 | disc = Sequential() 47 | disc.add(Dense(1024, input_dim=784)) 48 | disc.add(LeakyReLU(0.2)) 49 | disc.add(Dropout(0.2)) 50 | disc.add(Dense(512)) 51 | disc.add(LeakyReLU(0.2)) 52 | disc.add(Dropout(0.2)) 53 | disc.add(Dense(256)) 54 | disc.add(LeakyReLU(0.2)) 55 | disc.add(Dropout(0.2)) 56 | disc.add(Dense(1, activation="sigmoid")) 57 | disc.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 58 | return disc 59 | 60 | 61 | def stacked_GAN(gen, disc): 62 | disc.trainable = False 63 | gan_input = Input(shape=(100,)) 64 | x = gen(gan_input) 65 | gan_out = disc(x) 66 | gan_stack = Model(inputs=gan_input, outputs=gan_out) 67 | gan_stack.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=0.5)) 68 | return gan_stack 69 | 70 | 71 | def train( 72 | gen, 73 | disc, 74 | gan_stack, 75 | max_iter: int, 76 | batch_size: int, 77 | img_shape: int, 78 | file_path: str, 79 | artifact_name: str, 80 | exp_name: str, 81 | ): 82 | 83 | mlflow.set_tracking_uri(tracking_uri) 84 | client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) 85 | try: 86 | expr_name = exp_name # 
create a new experiment (do not replace)
87 |         mlflow.create_experiment(expr_name, s3_bucket)
88 |         mlflow.set_experiment(expr_name)
89 |         experiment = mlflow.get_experiment_by_name(exp_name)
90 |     except Exception:
91 |         experiment = mlflow.get_experiment_by_name(exp_name)
92 | 
93 |     os.makedirs(os.path.join(file_path, artifact_name), exist_ok=True)
94 |     mlflow.tensorflow.autolog(every_n_iter=1)
95 |     with mlflow.start_run(experiment_id=experiment.experiment_id) as run:
96 | 
97 |         mlflow.log_metrics(
98 |             {
99 |                 "batch_size": batch_size,
100 |                 "epochs": max_iter,
101 |                 "image_shape": img_shape,
102 |             }
103 |         )
104 | 
105 |         (X_train, _), (_, _) = mnist.load_data()
106 |         X_train = (X_train.astype(np.float32) - 127.5) / 127.5
107 |         X_train = X_train.reshape(X_train.shape[0], img_shape)
108 | 
109 |         for i in range(0, max_iter):
110 |             noise = np.random.normal(0, 1, (batch_size, 100))
111 |             image_batch = X_train[
112 |                 np.random.randint(0, X_train.shape[0], size=batch_size)
113 |             ]
114 | 
115 |             fake_images = gen.predict(noise)
116 | 
117 |             final_images = np.concatenate([image_batch, fake_images])
118 |             final_labels = np.concatenate(
119 |                 (
120 |                     np.ones((np.int64(batch_size), 1)),
121 |                     np.zeros((np.int64(batch_size), 1)),
122 |                 )
123 |             )
124 | 
125 |             disc.trainable = True
126 |             disc_loss = disc.train_on_batch(final_images, final_labels)
127 | 
128 |             disc.trainable = False
129 |             y_mis_labels = np.ones(batch_size)
130 |             gen_loss = gan_stack.train_on_batch(noise, y_mis_labels)
131 | 
132 |             mlflow.log_metrics(
133 |                 {"generator_loss": gen_loss, "discriminator_loss": disc_loss}
134 |             )
135 | 
136 |             print(
137 |                 "epoch_%d---->gen_loss:[%f]---->disc_loss:[%f]"
138 |                 % (i, gen_loss, disc_loss)
139 |             )
140 |             # if i % 1000 == 0:
141 |             #     test(gen, i)
142 | 
143 |         gen.save(os.path.join(file_path, artifact_name, "generator.h5"))
144 |         # disc.save(os.path.join(file_path, artifact_name, "discriminator.h5"))
145 | 
146 |         meta_data_path = os.path.join(file_path, artifact_name)
147 |         for artifact in sorted(os.listdir(meta_data_path)):
148 |             if artifact != ".DS_Store":
149 |                 artifact_path = os.path.join(meta_data_path, artifact)
150 |                 if (
151 |                     os.path.isfile(artifact_path)
152 |                     and artifact_path.split(".")[-1] != "h5"
153 |                 ):
154 |                     print(f"artifact to be uploaded is: {artifact}")
155 |                     mlflow.log_artifact(local_path=artifact_path)
156 | 
157 |         artifact_uri = mlflow.get_artifact_uri()
158 |         print(artifact_uri)
159 |     mlflow.end_run()
160 | 
161 | 
162 | parser = argparse.ArgumentParser()
163 | parser.add_argument("-fp", "--file_path", help="Directory path to save artifacts")
164 | parser.add_argument("-a", "--art_name", help="Artifacts name")
165 | parser.add_argument("-b", "--batch_size", default=32, help="Batch size for training")
166 | parser.add_argument("-e", "--epochs", default=20000, help="Number of epochs")
167 | parser.add_argument("-is", "--img_shape", default=784, help="One dimension of image")
168 | parser.add_argument(
169 |     "-exp",
170 |     "--experiment_name",
171 |     default="vanilla_gan",
172 |     help="Name of the experiment for tracking",
173 | )
174 | args = parser.parse_args()
175 | # Build a single generator/discriminator pair and reuse it inside the stacked
176 | # GAN; constructing fresh instances for stacked_GAN would train one pair of
177 | # networks while stacking a different, untrained pair.
178 | gen = generator()
179 | disc = discriminator()
180 | train(
181 |     gen=gen,
182 |     disc=disc,
183 |     gan_stack=stacked_GAN(gen=gen, disc=disc),
184 |     max_iter=int(args.epochs),
185 |     batch_size=int(args.batch_size),
186 |     img_shape=int(args.img_shape),
187 |     file_path=args.file_path,
188 |     artifact_name=args.art_name,
189 |     exp_name=args.experiment_name,
190 | )
191 | 
--------------------------------------------------------------------------------