├── Images
│   └── sample_ktp.png
├── KTPextractor_main.py
├── OCR_texts
│   └── ocr_sample_ktp.npy
├── Output_data
│   └── data_ocr_sample_ktp.csv
├── README.md
├── ktp_entity_extractor.py
├── kyc_config.py
├── my_gcvision_api_key.json
└── ocr_text_extractor.py

/Images/sample_ktp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bukalapak/KTPextractor/399f54dade396f3c7dfbe3e05040a4296803ccda/Images/sample_ktp.png
--------------------------------------------------------------------------------
/KTPextractor_main.py:
--------------------------------------------------------------------------------
import sys
import kyc_config as cfg
import ocr_text_extractor as ocr
import ktp_entity_extractor as extractor

if __name__ == '__main__':
    if(len(sys.argv) > 1):
        # input: image path
        img_path = sys.argv[1]
        print('OCR processing '+img_path)
        ocr.process_ocr(img_path)

        img_name = img_path.split('/')[-1].split('.')[0]
        ocr_path = cfg.json_loc+'ocr_'+img_name+'.npy'
        print('Extracting data from '+ocr_path)
        extractor.process_extract_entities(ocr_path)
    else:
        print('argument is missing: image path')
--------------------------------------------------------------------------------
/OCR_texts/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bukalapak/KTPextractor/399f54dade396f3c7dfbe3e05040a4296803ccda/OCR_texts/.DS_Store
--------------------------------------------------------------------------------
/OCR_texts/ocr_sample_ktp.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bukalapak/KTPextractor/399f54dade396f3c7dfbe3e05040a4296803ccda/OCR_texts/ocr_sample_ktp.npy
--------------------------------------------------------------------------------
/Output_data/data_ocr_sample_ktp.csv:
--------------------------------------------------------------------------------
province,city,identity_number,fullname,birth_place,birth_date,nationality,occupation,gender,marital_status,blood_type,address,rt_rw,kel_desa,kecamatan,religion,expired_date,state
DKI JAKARTA,,317507010190999,BILLY BUMBLEBEE SIFULAN,SURABAYA,1990-01-01,INDONESIA,Karyawan Swasta,male,married,AB,JL DIMANA NO 100 AB,001/001,ANTAH BERANTAH,DUREN SAWIT,ISLAM,SEUMUR HIDUP,ok
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# KTPextractor

This is a service to extract data from a KTP (Indonesian identity card) image. It is part of an open-source effort by the Data Scientists of Bukalapak. Other open-source projects: https://github.com/bukalapak?q=data

### Config File
Please fill in the configuration in `kyc_config.py`:
`gcv_api_key_path`: path location of the GCV API key.
To get an API key, check https://cloud.google.com/vision/docs/setup
`json_loc`: path location to save the OCR output from GCV
`output_loc`: path location to save the extracted KTP data

### OCR Text Extractor
To extract texts from an image (OCR), use the following command:
```
python ocr_text_extractor.py <image_path>
```
The OCR output file will be saved in `json_loc` (see the config file).

### KTP Entity Extractor
To extract attributes from the KTP based on the OCR output, use the following command:
```
python ktp_entity_extractor.py <ocr_output_path>
```
The extracted KTP data will be saved in CSV format in `output_loc` (see the config file).

### KTP Data Extractor
To extract KTP data directly from a KTP image, use the following command:
```
python KTPextractor_main.py <image_path>
```
The extracted KTP data will be saved in CSV format in `output_loc` (see the config file).
--------------------------------------------------------------------------------
/ktp_entity_extractor.py:
--------------------------------------------------------------------------------
import sys
import os
import re
import math
import copy
import pandas as pd
import numpy as np
import bisect
from datetime import datetime
import kyc_config as cfg

def levenshtein(source, target):
    if len(source) < len(target):
        return levenshtein(target, source)

    # So now we have len(source) >= len(target).
    if len(target) == 0:
        return len(source)

    # We call tuple() to force strings to be used as sequences
    # ('c', 'a', 't', 's') - numpy uses them as values by default.
    source = np.array(tuple(source))
    target = np.array(tuple(target))

    # We use a dynamic programming algorithm, but with the
    # added optimization that we only need the last two rows
    # of the matrix.
    previous_row = np.arange(target.size + 1)
    for s in source:
        # Insertion (target grows longer than source):
        current_row = previous_row + 1

        # Substitution or matching:
        # Target and source items are aligned, and either
        # are different (cost of 1), or are the same (cost of 0).
        current_row[1:] = np.minimum(
                current_row[1:],
                np.add(previous_row[:-1], target != s))

        # Deletion (target grows shorter than source):
        current_row[1:] = np.minimum(
                current_row[1:],
                current_row[0:-1] + 1)

        previous_row = current_row

    return previous_row[-1]

def isNumber(words):
    # heuristic: treat a token as numeric when at least half of
    # its characters are digits
    digits = sum(c.isdigit() for c in words)
    return len(words) > 0 and digits >= len(words) / 2

def correct2numbers(words):
    words = words.replace(' ','')
    if isNumber(words):
        result = ''
        for cc in words:
            if cc in ['T','I']:
                result += '1'
            elif cc >= '0' and cc <= '9':
                result += cc
            else:
                result += '0'
        words = result
    return words

def calDegBox(box,x,y,w):
    ls_cal_abs = [np.abs(nx-x)+np.abs(ny-y) for nx,ny in box]
    index = np.argmin(ls_cal_abs)

    ls_cal_abs2 = [np.abs(nx-x-w)+np.abs(ny-y) for nx,ny in box]
    index2 = np.argmin(ls_cal_abs2)

    x1,y1 = box[index]
    x2,y2 = box[index2]
    myradians = math.atan2(y1-y2, x1-x2)
    mydegrees = math.degrees(myradians)
    mydegrees = mydegrees if mydegrees >= 0 else 360+mydegrees
    return mydegrees

def calDeg(x1,y1,x2,y2):
    myradians = math.atan2(y1-y2, x1-x2)
    mydegrees = math.degrees(myradians)
    mydegrees = mydegrees if mydegrees >= 0 else 360+mydegrees
    return mydegrees

def convert_format(text_response):
    ls_word = []
    if('textAnnotations' in text_response):
        for text in text_response['textAnnotations']:
            boxes = {}
            boxes['label'] = text['description']

            boxes['x1'] = text['boundingPoly']['vertices'][0].get('x',0)
            boxes['y1'] = text['boundingPoly']['vertices'][0].get('y',0)
            boxes['x2'] = text['boundingPoly']['vertices'][1].get('x',0)
            boxes['y2'] = text['boundingPoly']['vertices'][1].get('y',0)
            boxes['x3'] = text['boundingPoly']['vertices'][2].get('x',0)
            boxes['y3'] = text['boundingPoly']['vertices'][2].get('y',0)
            boxes['x4'] = text['boundingPoly']['vertices'][3].get('x',0)
            boxes['y4'] = text['boundingPoly']['vertices'][3].get('y',0)

            boxes['w'] = boxes['x3'] - boxes['x1']
            boxes['h'] = boxes['y3'] - boxes['y1']

            ls_word.append(boxes)
    return ls_word

def get_attribute_ktp(ls_word, field_name, field_keywords, typo_tolerance, debug_mode=False):
    if(len(ls_word)==0):
        return None

    if(field_name == 'nama'):
        ls_word = np.asarray([word for word in ls_word if word['label'].lower() not in ['jawa','nusa']])

    new_ls_word = np.asarray([word['label'].lower() for word in ls_word])

    ls_dist = [levenshtein(field_keywords, word.lower()) for word in new_ls_word]
    if np.min(ls_dist) > typo_tolerance:
        if(field_name == 'kota' and field_keywords != 'kota'):
            return get_attribute_ktp(ls_word,field_name,'kota',1,debug_mode)
        return None

    index = np.argmin(ls_dist)
    x,y = ls_word[index]['x1'], ls_word[index]['y1']
    w = ls_word[index]['w']
    degree = calDeg(ls_word[index]['x1'],ls_word[index]['y1'],ls_word[index]['x2'],ls_word[index]['y2'])

    ls_y = np.asarray([np.abs(y-word['y1'])<300 for word in ls_word])

    value_words = [ww for ww, val in zip(ls_word,ls_y) if (val and np.abs(calDeg(x,y,ww['x1'],ww['y1'])-degree)<3)]

    if debug_mode:
        print(value_words)

    # handling special attributes
    value_words = [val for val in value_words if len(val['label'].replace(' ','').replace(':',''))>0]

    d = [levenshtein('gol.', str(val['label']).lower()) for val in value_words]
    if(len(d)>0 and min(d) <= 1):
        idx = np.argmin(d)
        value_words.pop(idx)

    d = [levenshtein('darah', str(val['label']).lower()) for val in value_words]
    if(len(d)>0 and min(d) <= 1):
        idx = np.argmin(d)
        value_words.pop(idx)

    if(field_name == 'nik'):
        if(len(value_words)>0):
            global max_x
            max_x = max([val['x2'] for val in value_words])

    if(field_name == 'kota'):
        field_value = ""
        for val in value_words:
            field_value = field_value + ' ' + str(val['label'])
        field_value = field_value.lstrip()

        if(field_keywords == 'kabupaten'):
            return 'KABUPATEN '+field_value
        else:
            return 'KOTA '+field_value

    if(field_name == 'ttl'):
        d = [levenshtein('lahir', str(val['label']).lower()) for val in value_words]
        if(len(d)>0 and min(d) <= 2):
            idx = np.argmin(d)
            value_words.pop(idx)
    elif(field_name == 'jenis_kelamin'):
        d = [levenshtein('laki-laki', str(val['label']).lower()) for val in value_words]
        if(len(d)>0 and min(d) <= 2):
            return 'LAKI-LAKI'

        d = [levenshtein('laki', str(val['label']).lower()) for val in value_words]
        if(len(d)>0 and min(d) <= 1):
            return 'LAKI-LAKI'

        d = [levenshtein('wanita', str(val['label']).lower()) for val in value_words]
        if(len(d)>0 and min(d) <= 2):
            return 'WANITA'

        d = [levenshtein('perempuan', str(val['label']).lower()) for val in value_words]
        if(len(d)>0 and min(d) <= 2):
            return 'PEREMPUAN'

        return None

    elif(field_name == 'gol_darah'):
        vals = [val['label'] for val in value_words if len(val['label']) <= 3]
        if(len(vals)>0):
            return vals[0]
        else:
            return None

    elif(field_name == 'pekerjaan'):
        d = [levenshtein('kartu', str(val['label']).lower()) for val in value_words]
        if(len(d)>0 and min(d) <= 2):
            idx = np.argmin(d)
            value_words.pop(idx)

        value_words = [val for val in value_words if val['x1'] <= max_x]

    elif(field_name == 'kewarganegaraan'):
        # 'WNI' must actually appear (within typo distance) among the value words
        d = [levenshtein('wni', str(val['label']).lower()) for val in value_words]
        if(len(d)>0 and min(d) <= 1):
            return 'WNI'

        xlocs = [val['x1'] for val in value_words]
        if(len(xlocs)>0):
            idx = np.argmin(xlocs)
            return value_words[idx]['label']
        else:
            return None

    elif(field_name == 'status_perkawinan'):
        xlocs = [val['x1'] for val in value_words]
        if(len(xlocs)>0):
            idx = np.argmin(xlocs)
            field_value = value_words[idx]['label']

            if(levenshtein('belum',field_value.lower()) <= 1):
                return 'BELUM KAWIN'
            else:
                return field_value
        else:
            return None

    elif(field_name == 'berlaku_hingga'):
        d = [levenshtein('hingga', str(val['label']).lower()) for val in value_words]
        if(len(d)>0 and min(d) <= 2):
            idx = np.argmin(d)
            value_words.pop(idx)

        xlocs = [val['x1'] for val in value_words]
        if(len(xlocs)>0):
            idx = np.argmin(xlocs)
            field_value = value_words[idx]['label']
            if(levenshtein('seumur',field_value.lower()) <= 2):
                return 'SEUMUR HIDUP'
            else:
                return field_value
        else:
            return None

    field_value = ""
    for val in value_words:
        field_value = field_value + ' ' + str(val['label'])
    field_value = field_value.lstrip()

    return field_value

def get_gender(ls_word):
    new_ls_word = np.asarray([word['label'].lower() for word in ls_word])

    d = [levenshtein('laki-laki', word.lower()) for word in new_ls_word]
    if(len(d)>0 and min(d) <= 3):
        return 'male'

    d = [levenshtein('wanita', word.lower()) for word in new_ls_word]
    if(len(d)>0 and min(d) <= 2):
        return 'female'

    d = [levenshtein('perempuan', word.lower()) for word in new_ls_word]
    if(len(d)>0 and min(d) <= 2):
        return 'female'

    d = [levenshtein('pria', word.lower()) for word in new_ls_word]
    if(len(d)>0 and min(d) <= 1):
        return 'male'

    d = [levenshtein('laki', word.lower()) for word in new_ls_word]
    if(len(d)>0 and min(d) <= 1):
        return 'male'

    return None

def extract_date(date_string):
    if(date_string == None):
        return None

    date = None
    try:
        regex = re.compile(r'(\d{1,2}-\d{1,2}-\d{1,4})')
        tgl = re.findall(regex, date_string)
        if(len(tgl)>0):
            date = datetime.strptime(tgl[0], '%d-%m-%Y')
        else:
            tgl = ''.join([n for n in date_string if n.isdigit()])
            if(len(tgl)==8):
                date = datetime.strptime(tgl[0:2]+'-'+tgl[2:4]+'-'+tgl[4:], '%d-%m-%Y')
    except ValueError:
        return None

    if(date==None):
        return None

    if((date.year < 1910) or (date.year > 2100)):
        return None

    return date

def find_occupation(occ):
    if(occ==None):
        return None

    result = occ
    if(levenshtein('mengurus rumah tangga',occ.lower()) <= 6):
        result = 'Mengurus Rumah Tangga'
    if(levenshtein('buruh harian lepas',occ.lower()) <= 6):
        result = 'Buruh Harian Lepas'
    if(levenshtein('pegawai negeri sipil',occ.lower()) <= 5):
        result = 'Pegawai Negeri Sipil'
    if(levenshtein('pelajar/mahasiswa',occ.lower()) <= 4):
        result = 'Pelajar/Mahasiswa'
    if(levenshtein('pelajar/mhs',occ.lower()) <= 3):
        result = 'Pelajar/Mahasiswa'
    if(levenshtein('belum/tidak bekerja',occ.lower()) <= 5):
        result = 'Belum/Tidak Bekerja'
    if(levenshtein('karyawan swasta',occ.lower()) <= 4):
        result = 'Karyawan Swasta'
    if(levenshtein('pegawai negeri',occ.lower()) <= 4):
        result = 'Pegawai Negeri'
    if(levenshtein('wiraswasta',occ[0:10].lower()) <= 3):
        result = 'Wiraswasta'
    if(levenshtein('peg negeri',occ.lower()) <= 3):
        result = 'Pegawai Negeri'
    if(levenshtein('peg swasta',occ.lower()) <= 3):
        result = 'Pegawai Swasta'

    return result


fields_ktp = [
    {'field_name': 'provinsi', 'keywords': 'provinsi', 'typo_tolerance': 2},
    {'field_name': 'kota', 'keywords': 'kabupaten', 'typo_tolerance': 2},
    {'field_name': 'nik', 'keywords': 'nik', 'typo_tolerance': 1},
    {'field_name': 'nama', 'keywords': 'nama', 'typo_tolerance': 2},
    {'field_name': 'ttl', 'keywords': 'tempat/tgl', 'typo_tolerance': 5},
    {'field_name': 'jenis_kelamin', 'keywords': 'kelamin', 'typo_tolerance': 3},
    {'field_name': 'gol_darah', 'keywords': 'darah', 'typo_tolerance': 3},
    {'field_name': 'alamat', 'keywords': 'alamat', 'typo_tolerance': 2},
    {'field_name': 'rt_rw', 'keywords': 'rt/rw', 'typo_tolerance': 2},
    {'field_name': 'kel_desa', 'keywords': 'kel/desa', 'typo_tolerance': 3},
    {'field_name': 'kecamatan', 'keywords': 'kecamatan', 'typo_tolerance': 3},
    {'field_name': 'agama', 'keywords': 'agama', 'typo_tolerance': 3},
    {'field_name': 'status_perkawinan', 'keywords': 'perkawinan', 'typo_tolerance': 4},
    {'field_name': 'pekerjaan', 'keywords': 'pekerjaan', 'typo_tolerance': 4},
    {'field_name': 'kewarganegaraan', 'keywords': 'kewarganegaraan', 'typo_tolerance': 4},
    {'field_name': 'berlaku_hingga', 'keywords': 'berlaku', 'typo_tolerance': 4}
]

def extract_ktp_data(text_response, debug_mode=False):

    ktp_extract = pd.DataFrame(columns=['province','city','identity_number','fullname','birth_place','birth_date','nationality','occupation','gender','marital_status',
                                        'blood_type','address','rt_rw','kel_desa','kecamatan','religion','expired_date','state'])

    attributes = {}

    ls_word = convert_format(text_response)

    if(len(ls_word)==0):
        attributes['state'] = "rejected"
        # DataFrame.append was removed in pandas 2.0; concatenate instead
        ktp_extract = pd.concat([ktp_extract, pd.DataFrame([attributes])], ignore_index=True)
        return ktp_extract

    global max_x
    max_x = 9999

    raw_result = {}

    for field in fields_ktp:
        field_value = get_attribute_ktp(ls_word,field['field_name'],field['keywords'],field['typo_tolerance'],debug_mode)
        if(field_value != None):
            field_value = str(field_value).replace(': ','').replace(':','')
        # also store None, so later lookups never raise a KeyError
        raw_result[field['field_name']] = field_value

    attributes['state'] = 'ok'

    attributes['identity_number'] = raw_result['nik']
    if(attributes['identity_number'] != None):
        attributes['identity_number'] = ''.join([i for i in raw_result['nik'] if i.isdigit()])

    if(attributes['identity_number'] == None or attributes['identity_number'] == ''):
        attributes['state'] = "rejected"
        ktp_extract = pd.concat([ktp_extract, pd.DataFrame([attributes])], ignore_index=True)
        return ktp_extract

    attributes['fullname'] = raw_result['nama']
    if(raw_result['nama'] != None):
        attributes['fullname'] = ''.join([i for i in raw_result['nama'] if not i.isdigit()]).replace('-','').strip()

    if(raw_result['jenis_kelamin'] == 'LAKI-LAKI'):
        attributes['gender'] = 'male'
    elif(raw_result['jenis_kelamin'] in ['WANITA','PEREMPUAN']):
        attributes['gender'] = 'female'
    else:
        attributes['gender'] = get_gender(ls_word)

    attributes['birth_place'] = None
    attributes['birth_date'] = None

    if(raw_result['ttl'] != None):
        ttls = raw_result['ttl'].split(', ')
        if(len(ttls)>=2):
            attributes['birth_place'] = ttls[0]
            attributes['birth_date'] = extract_date(ttls[1])
        elif(len(ttls)==1):
            attributes['birth_place'] = ttls[0]

        if(attributes['birth_date'] == None):
            attributes['birth_date'] = extract_date(raw_result['ttl'])

    if(attributes['birth_place'] != None):
        attributes['birth_place'] = ''.join([i for i in attributes['birth_place'] if not i.isdigit()]).replace('-','').replace('.','').strip()

    attributes['nationality'] = raw_result['kewarganegaraan']
    if(attributes['nationality'] == "WNI"):
        attributes['nationality'] = "INDONESIA"

    attributes['marital_status'] = raw_result['status_perkawinan']
    if(attributes['marital_status'] != None):
        if(levenshtein('belum kawin',attributes['marital_status'].lower()) <= 2):
            attributes['marital_status'] = 'single'
        elif(levenshtein('tidak kawin',attributes['marital_status'].lower()) <= 2):
            attributes['marital_status'] = 'single'
        elif(levenshtein('kawin',attributes['marital_status'].lower()) <= 1):
            attributes['marital_status'] = 'married'
        elif(levenshtein('janda',attributes['marital_status'].lower()) <= 2):
            attributes['marital_status'] = 'widowed'
        elif(levenshtein('duda',attributes['marital_status'].lower()) <= 2):
            attributes['marital_status'] = 'widowed'
        elif(levenshtein('cerai',attributes['marital_status'].lower()) <= 2):
            attributes['marital_status'] = 'widowed'
        else:
            attributes['marital_status'] = None

    attributes['occupation'] = find_occupation(raw_result['pekerjaan'])

    # bonus
    if(raw_result['gol_darah'] != None):
        attributes['blood_type'] = ''.join([i for i in raw_result['gol_darah'] if not i.isdigit()]).strip()
        if(attributes['blood_type'].lower() not in ['a','b','ab','o']):
            attributes['blood_type'] = None
    else:
        attributes['blood_type'] = None

    attributes['province'] = raw_result['provinsi']
    attributes['city'] = raw_result['kota']
    attributes['address'] = raw_result['alamat']
    attributes['rt_rw'] = raw_result['rt_rw']
    attributes['kel_desa'] = raw_result['kel_desa']
    attributes['kecamatan'] = raw_result['kecamatan']
    attributes['religion'] = raw_result['agama']
    attributes['expired_date'] = raw_result['berlaku_hingga']

    ktp_extract = pd.concat([ktp_extract, pd.DataFrame([attributes])], ignore_index=True)

    return ktp_extract

def process_extract_entities(ocr_path):
    try:
        # allow_pickle is required to load object arrays since numpy 1.16.3
        text_response = np.load(ocr_path, allow_pickle=True).item()
    except Exception:
        print(ocr_path+' cannot be loaded')
        return

    ktp_extract = extract_ktp_data(text_response)
    print(ktp_extract.iloc[0])

    ocr_name = ocr_path.split('/')[-1].split('.')[0]
    output_name = cfg.output_loc+'data_'+ocr_name+'.csv'
    ktp_extract.to_csv(output_name,index=False)

if __name__ == '__main__':
    if(len(sys.argv) > 1):
        # input: ocr file path
        ocr_path = sys.argv[1]
        print('Extracting data from '+ocr_path)
        process_extract_entities(ocr_path)
    else:
        print('argument is missing: ocr output file path')
--------------------------------------------------------------------------------
/kyc_config.py:
--------------------------------------------------------------------------------
gcv_api_key_path = 'my_gcvision_api_key.json'
json_loc = 'OCR_texts/'
output_loc = 'Output_data/'
--------------------------------------------------------------------------------
/my_gcvision_api_key.json:
--------------------------------------------------------------------------------
PLEASE REPLACE THIS FILE WITH YOUR OWN GCV API KEY
{
  "type": "",
  "project_id": "",
  "private_key_id": "",
  "private_key": "",
  "client_email": "",
  "client_id": "",
  "auth_uri": "",
  "token_uri": "",
  "auth_provider_x509_cert_url": "",
  "client_x509_cert_url": ""
}
--------------------------------------------------------------------------------
/ocr_text_extractor.py:
--------------------------------------------------------------------------------
from google.cloud import vision
# NOTE: this module targets google-cloud-vision < 2.0; in 2.x the `types`
# submodule was removed and vision.Image should be used instead.
from google.cloud.vision import types
from google.protobuf.json_format import MessageToDict
import pandas as pd
import numpy as np
import sys
import kyc_config as cfg

client = vision.ImageAnnotatorClient.from_service_account_file(
    cfg.gcv_api_key_path
)

def get_text_response_from_path(path):

    output = None

    try:
        if path.startswith('http') or path.startswith('gs:'):
            image = types.Image()
            image.source.image_uri = path
        else:
            with open(path, 'rb') as image_file:
                content = image_file.read()
            image = types.Image(content=content)
    except (ValueError, OSError):
        output = "Cannot Read Input File"
        return output

    text_response = client.text_detection(image=image)
    text_response = MessageToDict(text_response)
    return text_response

def process_ocr(img_path):
    text_response = get_text_response_from_path(img_path)

    # save the output file
    img_name = img_path.split('/')[-1].split('.')[0]
    json_name = cfg.json_loc+'ocr_'+img_name+'.npy'
    np.save(json_name, text_response)

if __name__ == '__main__':
    if(len(sys.argv) > 1):
        # input: image path
        img_path = sys.argv[1]
        print('OCR processing '+img_path)
        process_ocr(img_path)
    else:
        print('argument is missing: image path')
--------------------------------------------------------------------------------
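
The pipeline persists the OCR payload as a pickled Python dict inside a NumPy `.npy` file (`np.save` on the dict, `np.load(...).item()` to get it back). A minimal sketch of that round trip, using a hypothetical hand-built response dict in the shape `MessageToDict` would produce, rather than a real GCV reply; note that NumPy 1.16.3 and later require `allow_pickle=True` when loading object arrays:

```python
import os
import tempfile
import numpy as np

# hypothetical stand-in for the dict a GCV text_detection response yields
fake_response = {
    'textAnnotations': [
        {'description': 'PROVINSI',
         'boundingPoly': {'vertices': [{'x': 10, 'y': 10}, {'x': 90, 'y': 10},
                                       {'x': 90, 'y': 30}, {'x': 10, 'y': 30}]}},
    ]
}

path = os.path.join(tempfile.mkdtemp(), 'ocr_demo.npy')
np.save(path, fake_response)  # wraps the dict in a 0-d object array

# .item() unwraps the 0-d array back into the original dict
loaded = np.load(path, allow_pickle=True).item()
assert loaded == fake_response
```

This is also why `process_extract_entities` needs `allow_pickle=True`: the saved array holds an arbitrary Python object, which NumPy refuses to unpickle by default for safety.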