├── .gitignore
├── README.md
├── external_id_search
│   └── script.py
├── m0_preprocessing
│   └── convert_sid_to_jpg.py
├── m1_geotiff
│   └── convert_image_to_geotiff.py
├── m2_detection_recognition
│   └── crop_img.py
├── m3_image_geojson
│   ├── run.sh
│   └── stitch_output.py
├── m4_post_ocr
│   ├── logstash_postocr.conf
│   ├── post_ocr.py
│   ├── post_ocr_main.py
│   └── preprocess.py
├── m5_geocoordinate_converter
│   └── convert_geojson_to_geocoord.py
├── m6_entity_linker
│   ├── create_elasticsearch_index.py
│   ├── create_spatial_index_postgres.py
│   ├── entity_linking.py
│   ├── logstash_osm_linker.conf
│   ├── logstash_postgres_world.conf
│   ├── postgres_logger.py
│   ├── upload_osm_to_postgres_all_continents.py
│   └── upload_osm_to_postgres_ogr2ogr.py
├── m_sanborn
│   ├── s1_geocoding.py
│   ├── s2_clustering.py
│   └── s3_gen_geojson.py
├── requirements.txt
├── run.py
├── run_img.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
data/
data0/
data1/
rumsey_output/
.idea/
.env
MrSID*
__pycache__
debug/
.ipynb_checkpoints/
run_linker.py
osm_linker.csv
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

## About mapKurator System

**[New]**: Our documentation website for the mapKurator system is up! [https://knowledge-computing.github.io/mapkurator-doc/](https://knowledge-computing.github.io/mapkurator-doc/#/)

[mapKurator](https://dl.acm.org/doi/abs/10.1145/3589132.3625579) is a fully automatic pipeline developed by the [**Knowledge Computing Lab**](https://knowledge-computing.github.io/) at the **University of Minnesota** to process large numbers of scanned historical map images. Outputs include the recognized text labels, label bounding polygons, labels after post-OCR correction, and a geo-entity identifier from OpenStreetMap.

### mapKurator textspotter repository
Please refer to the following links for the spotter models used in mapKurator: [Spotter-v2](https://github.com/knowledge-computing/mapkurator-spotter), [PALETTE](https://github.com/knowledge-computing/mapkurator-palette)

---------

## Data Card - Derived Dataset Processed by mapKurator System

Map text recognized from the [Rumsey historical map collection](https://www.davidrumsey.com/), which contains 57K georeferenced maps.

### Dataset Download Link

Text extraction and recognition results: [https://s3.msi.umn.edu/rumsey_output/Round3/english.zip](https://s3.msi.umn.edu/rumsey_output/Round3/english.zip) (~50GB)

### Dataset Languages

English

### Language Creators:

Machine-generated

## Dataset Structure

### Data Fields



### Output File Name

Each output GeoJSON file is named after the external ID of the original map image.
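For reference, a feature produced by the full pipeline looks roughly like the following (values are illustrative; `text` and `score` come from the text spotter, `postocr_label` from the post-OCR module, and `osm_id` from the entity linker; the geocoordinate converter additionally preserves the pixel-space polygon under an `img_coordinates` property):

```json
{
  "type": "Feature",
  "geometry": {
    "type": "Polygon",
    "coordinates": [[[-13161875.8, 4035394.2], [-13161760.3, 4035394.2],
                     [-13161760.3, 4035440.1], [-13161875.8, 4035440.1],
                     [-13161875.8, 4035394.2]]]
  },
  "properties": {
    "text": "Sacramenta",
    "score": 0.92,
    "postocr_label": "Sacramento",
    "osm_id": ["6232940"]
  }
}
```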
### Citation
```
@inproceedings{kim2023mapkurator,
  title={The mapKurator System: A Complete Pipeline for Extracting and Linking Text from Historical Maps},
  author={Kim, Jina and Li, Zekun and Lin, Yijun and Namgung, Min and Jang, Leeje and Chiang, Yao-Yi},
  booktitle={Proceedings of the 31st ACM International Conference on Advances in Geographic Information Systems},
  pages={1--4},
  year={2023}
}
```

### Licensing Information

CC BY-NC 2.0

--------------------------------------------------------------------------------
/external_id_search/script.py:
--------------------------------------------------------------------------------
from elasticsearch_dsl import Search, Q
from elasticsearch import Elasticsearch, helpers
from elasticsearch import RequestsHttpConnection
import argparse
import os
import glob
import json
import nltk
import logging
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")



def db_connect():
    """Elasticsearch connection on Sansa"""
    load_dotenv()

    DB_HOST = os.getenv("DB_HOST")
    USER_NAME = os.getenv("DB_USERNAME")
    PASSWORD = os.getenv("DB_PASSWORD")

    es = Elasticsearch([DB_HOST], connection_class=RequestsHttpConnection, http_auth=(USER_NAME, PASSWORD), verify_certs=False)
    return es


def query(target):
    es = db_connect()
    inputs = target.upper()
    query = {"query": {"match": {"text": f"{inputs}"}}}
    hits = es.search(index="meta", body=query, size=10000)["hits"]["hits"]

    id_list = []
    if len(hits) != 0:
        for i in range(len(hits)):
            map_id = hits[i]['_source']['external_id']
            id_list.append(map_id)

    result = sorted(list(set(id_list)))
    return result


def main(args):
    keyword = args.target
    metadata_path = args.metadata
    meta_df = pd.read_csv(metadata_path)
    meta_df['tmp'] = meta_df['image_no'].str.split(".").str[0]

    results = query(keyword)
    # print(f'"{keyword}" appears in: {results}')

    tmp_df = meta_df[meta_df.tmp.isin(results)]

    print(f'"{keyword}" appears in:')
    for index, row in tmp_df.iterrows():
        print(f'{row.tmp} \t {row.title}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--target', type=str, default='east', help='keyword to search for in the map metadata')
    parser.add_argument('--metadata', type=str, default='/home/maplord/maplist_csv/luna_omo_metadata_56628_20220724.csv', help='path to the metadata csv')

    args = parser.parse_args()
    print(args)

    main(args)
--------------------------------------------------------------------------------
/m0_preprocessing/convert_sid_to_jpg.py:
--------------------------------------------------------------------------------
import os
import glob
import time
import multiprocessing

sid_dir = '/data/rumsey-sid'
sid_to_jpg_dir = '/data2/rumsey_sid_to_jpg/'
num_process = 20
if_print_command = True

sid_list = glob.glob(os.path.join(sid_dir, '*/*.sid'))

def execute_command(command, if_print_command):
    t1 = time.time()

    if if_print_command:
        print(command)
    os.system(command)

    t2 = time.time()
    time_usage = t2 - t1
    return time_usage

def conversion(img_path):
    mrsiddecode_executable = "/home/zekun/dr_maps/mapkurator-system/m1_geotiff/MrSID_DSDK-9.5.4.4709-rhel6.x86-64.gcc531/Raster_DSDK/bin/mrsiddecode"
    map_name = os.path.basename(img_path)[:-4]

    redirected_path = os.path.join(sid_to_jpg_dir, map_name + '.jpg')

    run_sid_to_jpg_command = mrsiddecode_executable + ' -quiet -i ' + img_path + ' -o ' + redirected_path
    time_usage = execute_command(run_sid_to_jpg_command, if_print_command)



if __name__ == "__main__":
    pool = multiprocessing.Pool(num_process)
    start_time = time.perf_counter()
    processes = [pool.apply_async(conversion, args=(sid_path,)) for sid_path in sid_list]
    result = [p.get() for p in processes]
    finish_time = time.perf_counter()
    print(f"Program finished in {finish_time-start_time} seconds")
--------------------------------------------------------------------------------
/m1_geotiff/convert_image_to_geotiff.py:
--------------------------------------------------------------------------------
import os
import glob
import pandas as pd
import ast
import argparse
import logging
import pdb

logging.basicConfig(level=logging.INFO)

def func_file_to_fullpath_dict(file_path_list):

    file_fullpath_dict = dict()
    for file_path in file_path_list:
        file_fullpath_dict[os.path.basename(file_path).split('.')[0]] = file_path

    return file_fullpath_dict

def main(args):

    jp2_root_dir = args.jp2_root_dir
    sid_root_dir = args.sid_root_dir
    additional_root_dir = args.additional_root_dir
    out_geotiff_dir = args.out_geotiff_dir

    sample_map_path = args.sample_map_path
    external_id_key = args.external_id_key

    jp2_file_path_list = glob.glob(os.path.join(jp2_root_dir, '*/*.jp2'))
    sid_file_path_list = glob.glob(os.path.join(sid_root_dir, '*.jpg'))  # use converted jpg directly
    add_file_path_list = glob.glob(os.path.join(additional_root_dir, '*'))

    jp2_file_fullpath_dict = func_file_to_fullpath_dict(jp2_file_path_list)
    sid_file_fullpath_dict = func_file_to_fullpath_dict(sid_file_path_list)
    add_file_fullpath_dict = func_file_to_fullpath_dict(add_file_path_list)

    sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id': str})


    for index, record in sample_map_df.iterrows():
        external_id = record.external_id
        transform_method = record.transformation_method
        gcps = record.gcps
        filename_without_extension = external_id.strip("'").replace('.', '')

        full_path = ''
        if filename_without_extension in jp2_file_fullpath_dict:
            full_path = jp2_file_fullpath_dict[filename_without_extension]
        elif filename_without_extension in sid_file_fullpath_dict:
            full_path = sid_file_fullpath_dict[filename_without_extension]
        elif filename_without_extension in add_file_fullpath_dict:
            full_path = add_file_fullpath_dict[filename_without_extension]
        else:
            print('image with external_id not found in image_dir:', external_id)
            continue
        assert (len(full_path) != 0)

        gcps = ast.literal_eval(gcps)

        gcp_str = ''
        for gcp in gcps:
            lng, lat = gcp['location']
            x, y = gcp['pixel']
            gcp_str += '-gcp ' + str(x) + ' ' + str(y) + ' ' + str(lng) + ' ' + str(lat) + ' '
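        # Example of one row's `gcps` cell (a stringified list of dicts) and the
        # resulting flag string; the values here are illustrative:
        #   [{'location': [-118.24, 34.05], 'pixel': [100, 200]}, ...]
        #   -> '-gcp 100 200 -118.24 34.05 ... '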
        # gdal_translate to attach the GCPs to the raw image
        gdal_command = 'gdal_translate -of Gtiff ' + gcp_str + full_path + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff'
        print(gdal_command)
        os.system(gdal_command)


        assert transform_method in ['affine', 'polynomial', 'tps']

        # reprojection with gdalwarp
        if transform_method == 'affine':
            # first order
            warp_command = 'gdalwarp -s_srs EPSG:4326 -t_srs EPSG:3857 -r near -order 1 -of GTiff ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff' + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '.geotiff'

        elif transform_method == 'polynomial':
            # second order
            warp_command = 'gdalwarp -s_srs EPSG:4326 -t_srs EPSG:3857 -r near -order 2 -of GTiff ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff' + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '.geotiff'

        elif transform_method == 'tps':
            # thin plate spline  #debug/11558008.geotiff #10057000.geotiff
            warp_command = 'gdalwarp -s_srs EPSG:4326 -t_srs EPSG:3857 -r near -tps -of GTiff ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff' + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '.geotiff'

        else:
            raise NotImplementedError
        print(warp_command)
        os.system(warp_command)
        # remove temporary tiff file
        # os.system('rm ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff')


        logging.info('Done generating geotiff for %s', external_id)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--jp2_root_dir', type=str, default='/data/rumsey-jp2/',
                        help='image dir of jp2 files.')
    parser.add_argument('--sid_root_dir', type=str, default='/data2/rumsey_sid_to_jpg/',
                        help='image dir of sid files (converted to jpg).')
    parser.add_argument('--additional_root_dir', type=str, default='/data2/rumsey-luna-img/',
                        help='image dir of additional luna files.')
    parser.add_argument('--out_geotiff_dir', type=str, default='data/geotiff/',
                        help='output dir for geotiff')
    parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
                        help='path to sample map csv, which contains gcps info')
    parser.add_argument('--external_id_key', type=str, default='external_id',
                        help='key string for external id, could be external_id or ListNo')

    args = parser.parse_args()
    print(args)


    main(args)
--------------------------------------------------------------------------------
/m2_detection_recognition/crop_img.py:
--------------------------------------------------------------------------------
import sys
import os
from PIL import Image, ImageFile
import numpy as np
import argparse
import logging

logging.basicConfig(level=logging.INFO)
Image.MAX_IMAGE_PIXELS = None  # allow reading huge images

# allow loading truncated image files (requires the ImageFile import above)
ImageFile.LOAD_TRUNCATED_IMAGES = True

def main(args):

    img_path = args.img_path
    output_dir = args.output_dir

    map_name = os.path.basename(img_path).split('.')[0]  # map name without extension
    output_dir = os.path.join(output_dir, map_name)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    map_img = Image.open(img_path)
    width, height = map_img.size

    # print(width, height)

    shift_size = 1000

    # pad the image to a size divisible by shift_size
    num_tiles_w = int(np.ceil(1. * width / shift_size))
    num_tiles_h = int(np.ceil(1. * height / shift_size))
    enlarged_width = int(shift_size * num_tiles_w)
    enlarged_height = int(shift_size * num_tiles_h)

    enlarged_map = Image.new(mode="RGB", size=(enlarged_width, enlarged_height))
    # paste map_img onto the enlarged canvas
    enlarged_map.paste(map_img)

    for idx in range(0, num_tiles_h):
        for jdx in range(0, num_tiles_w):
            img_clip = enlarged_map.crop((jdx * shift_size, idx * shift_size, (jdx + 1) * shift_size, (idx + 1) * shift_size))

            # tiles are named h{row}_w{col}.jpg, e.g. h2_w3.jpg covers pixels x in [3000, 4000) and y in [2000, 3000)
            out_path = os.path.join(output_dir, 'h' + str(idx) + '_w' + str(jdx) + '.jpg')
            img_clip.save(out_path)

    logging.info('Done cropping %s' % img_path)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--img_path', type=str, default='../data/100_maps/8628000.jp2',
                        help='path to image file.')
    parser.add_argument('--output_dir', type=str, default='../data/100_maps_crop/',
                        help='path to output dir')

    args = parser.parse_args()
    print(args)


    # if not os.path.isdir(args.output_dir):
    #     os.makedirs(args.output_dir)
    #     print('created dir', args.output_dir)

    main(args)
--------------------------------------------------------------------------------
/m3_image_geojson/run.sh:
--------------------------------------------------------------------------------
# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/13415000' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/13415000.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/0845008' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/0845008.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/8407000' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/8407000.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/13272452' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/13272452.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/6855023' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/6855023.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/10198088' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/10198088.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/2119002' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/2119002.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/5850099' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/5850099.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/0352067' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/0352067.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/8496000' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/8496000.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/10285112' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/10285112.geojson'
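# Batch variant (a sketch, not part of the original runs): stitch every map
# under one spotter output dir, following the path layout used in this script.
#
# for d in /data2/rumsey_output/57k_maps_r3/spotter/arabic/*/ ; do
#     map_id=$(basename "$d")
#     python3 stitch_output.py --input_dir="${d%/}" --output_geojson="/data2/rumsey_output/57k_maps_r3/stitch/arabic/${map_id}.geojson"
# done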
# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/11201250' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/11201250.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/7924008' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/7924008.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/8859002' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/8859002.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/2239006' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/2239006.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/6954000' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/6954000.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/9085004' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/9085004.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/6353076' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/6353076.geojson'


python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/english/0019149' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/english/0019149.geojson'
python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/russian/0019149' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/russian/0019149.geojson'
python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/0019149' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/0019149.geojson'
python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/chinese/0019149' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/chinese/0019149.geojson'
--------------------------------------------------------------------------------
/m3_image_geojson/stitch_output.py:
--------------------------------------------------------------------------------
import os
import glob
import pandas as pd
import numpy as np
import argparse
from geojson import Polygon, Feature, FeatureCollection, dump
import logging
import pdb

# logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.ERROR)
pd.options.mode.chained_assignment = None

def concatenate_and_convert_to_geojson(args):
    map_subdir = args.input_dir
    output_geojson = args.output_geojson
    shift_size = args.shift_size
    eval_bool = args.eval_only

    file_list = glob.glob(map_subdir + '/*.json')
    file_list = sorted(file_list)
    if len(file_list) == 0:
        logging.warning('No files found for %s' % map_subdir)

    map_data = []
    for file_path in file_list:
        patch_index_h, patch_index_w = os.path.basename(file_path).split('.')[0].split('_')
        patch_index_h = int(patch_index_h[1:])
        patch_index_w = int(patch_index_w[1:])

        try:
            # force the text column to dtype 'object' to avoid type coercion (e.g. '6' -> 6.0, 'NAn' -> nan)
            df = pd.read_json(file_path, dtype={'text': object})
        except pd.errors.EmptyDataError:
            logging.warning('%s is empty. Skipping.' % file_path)
            continue
        except KeyError as ke:
            logging.warning('%s has no detected labels. Skipping.' % file_path)
            continue
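        # Patch offsets: tile h{i}_w{j}.json holds detections in tile-local
        # pixel coordinates, so each polygon is shifted by j*shift_size in x
        # and i*shift_size in y. E.g. with shift_size=1000, a point (120, 40)
        # in h2_w3.json maps to (3120, 2040) on the full map.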
        for index, line_data in df.iterrows():
            df['polygon_x'][index] = np.array(df['polygon_x'][index]).astype(float) + shift_size * patch_index_w
            df['polygon_y'][index] = np.array(df['polygon_y'][index]).astype(float) + shift_size * patch_index_h
        map_data.append(df)

    if len(map_data) == 0:
        with open(output_geojson, 'w', encoding='utf8') as f:
            pass
        print('created empty geojson for', output_geojson)
        return 0

    map_df = pd.concat(map_data)


    features = []
    for index, line_data in map_df.iterrows():
        polygon_x, polygon_y = list(line_data['polygon_x']), list(line_data['polygon_y'])

        if eval_bool == False:
            # y stays positive in the input but must be negated for QGIS visualization,
            # hence [x, -y]; appending [polygon_x[0], -polygon_y[0]] closes the polygon
            # loop, otherwise QGIS cannot display it
            polygon = Polygon([[[x, -y] for x, y in zip(polygon_x, polygon_y)] + [[polygon_x[0], -polygon_y[0]]]])
        else:
            polygon = Polygon([[[x, y] for x, y in zip(polygon_x, polygon_y)] + [[polygon_x[0], polygon_y[0]]]])

        text = line_data['text']
        score = line_data['score']
        features.append(Feature(geometry=polygon, properties={"text": text, "score": score}))

    feature_collection = FeatureCollection(features)
    # with open(os.path.join(output_dir, map_subdir + '.geojson'), 'w') as f:
    #     dump(feature_collection, f)
    with open(output_geojson, 'w', encoding='utf8') as f:
        dump(feature_collection, f, ensure_ascii=False)

    logging.info('Done generating geojson (img coord) for %s', map_subdir)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_dir', type=str, default='data/100_maps_crop_abc/0063014',
                        help='path to input json dir.')

    parser.add_argument('--output_geojson', type=str, default='data/100_maps_geojson_abc/0063014.geojson',
                        help='path to output geojson file')

    parser.add_argument('--shift_size', type=int, default=1000,
                        help='image patch size and shift size.')

    # This flag cannot be a string-typed argument; any non-empty string would be interpreted as True.
    parser.add_argument('--eval_only', default=False, action='store_true',
                        help='keep coordinates positive')

    args = parser.parse_args()
    print(args)

    concatenate_and_convert_to_geojson(args)
--------------------------------------------------------------------------------
/m4_post_ocr/logstash_postocr.conf:
--------------------------------------------------------------------------------
input {
  file {
    # NOTE: logstash's file input expects an absolute path; point this at the full path of total.csv
    path => "total.csv"
    start_position => beginning
    sincedb_path => "/dev/null"
  }
}
filter {
  csv {
    separator => ","
    columns => [
      "name",
      "count"
    ]
  }
}
output {
  elasticsearch {
    hosts => "localhost:9200"
    index => "osm-voca"
    doc_as_upsert => true
  }
}
--------------------------------------------------------------------------------
/m4_post_ocr/post_ocr.py:
--------------------------------------------------------------------------------
import logging
import requests
import json
import http.client as http_client
import nltk
import re

import elasticsearch
import elasticsearch.helpers


def lexical_search_query(target_text, es):
    """Query candidates and return the best one to be saved as 'postocr_label'"""

    clean_txt = []
    if type(target_text) == str:
        if any(char.isdigit() for char in target_text) == False:
            for t in range(len(target_text)):
                txt = target_text[t]
                if txt.isalpha():
                    clean_txt.append(txt)

            temp_label = ''.join([str(item) for item in clean_txt])
            if len(temp_label) != 0:
                target_text = temp_label

            process = re.findall('[A-Z][^A-Z]*', target_text)
            if all(c.isupper() for c in process) or len(process) == 1:

                if type(target_text) == str and any(c.isalpha() for c in target_text):
                    # edit distance 0
                    fuzziness = 0
                    inputs = target_text.lower()
                    q1 = {'query': {'fuzzy': {'name': {'value': inputs, 'fuzziness': fuzziness}}}}
                    try:
                        es_results = elasticsearch.helpers.scan(es, index="osm-voca", preserve_order=True, query=q1)
                    except elasticsearch.ElasticsearchException as es_error:
                        print(es_error)
                        return str(target_text)

                    test = [item['_source'] for item in es_results if item["_source"]['name'] is not None]


                    edist = []
                    edist_update = []

                    edd_min_find = 0
                    min_candidates = False

                    if test:
                        for tt in range(len(test)):
                            if 'name' in test[tt]:
                                candidate = test[tt]['name']
                                edist.append(candidate)

                        for e in range(len(edist)):
                            edd = nltk.edit_distance(inputs.upper(), edist[e].upper())

                            if edd == 0:
                                edist_update.append(edist[e])
                                min_candidates = edist[e]
                                edd_min_find = 1
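                    # Matching cascade: try exact vocabulary hits first (edit
                    # distance 0); if none are found, widen to fuzziness 1 and
                    # then 2, each time keeping the candidate with the highest
                    # OSM frequency count. E.g. 'sacramenta' has no exact hit,
                    # but at edit distance 1 'sacramento' is typically the most
                    # frequent candidate, so it becomes the postocr_label.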
                    # edit distance 1
                    if edd_min_find != 1:
                        fuzziness = 1

                        q2 = {'query': {'fuzzy': {'name': {'value': inputs, 'fuzziness': fuzziness}}}}
                        try:
                            es_results = elasticsearch.helpers.scan(es, index="osm-voca", preserve_order=True, query=q2)
                        except elasticsearch.ElasticsearchException as es_error:
                            print(es_error)
                            return str(target_text)

                        test = [item['_source'] for item in es_results if item["_source"]['name'] is not None]


                        edist = []
                        edist_count = []
                        edist_update = []
                        edist_count_update = []

                        if test:
                            for tt in range(len(test)):
                                if 'name' in test[tt]:
                                    # the raw logstash 'message' field holds the csv line 'name,count'
                                    candidate = test[tt]['message']
                                    cand = candidate.split(',')[0]
                                    count = candidate.split(',')[1]
                                    edist.append(cand)
                                    edist_count.append(count)

                            for e in range(len(edist)):
                                edd = nltk.edit_distance(inputs.upper(), edist[e].upper())

                                if edd == 1:
                                    edist_update.append(edist[e])
                                    edist_count_update.append(edist_count[e])

                            if len(edist_update) != 0:
                                # counts are strings; compare numerically, not lexicographically
                                index = edist_count_update.index(max(edist_count_update, key=lambda c: int(c) if str(c).strip().isdigit() else -1))
                                min_candidates = edist_update[index]
                                edd_min_find = 1

                    # edit distance 2
                    if edd_min_find != 1:
                        fuzziness = 2
                        q3 = {'query': {'fuzzy': {'name': {'value': inputs, 'fuzziness': fuzziness}}}}
                        try:
                            es_results = elasticsearch.helpers.scan(es, index="osm-voca", preserve_order=True, query=q3)
                        except elasticsearch.ElasticsearchException as es_error:
                            print(es_error)
                            return str(target_text)

                        test = [item['_source'] for item in es_results if item["_source"]['name'] is not None]

                        edist = []
                        edist_count = []
                        edist_update = []
                        edist_count_update = []

                        if test:
                            for tt in range(len(test)):
                                if 'name' in test[tt]:
                                    candidate = test[tt]['message']
                                    cand = candidate.split(',')[0]
                                    count = candidate.split(',')[1]
                                    edist.append(cand)
                                    edist_count.append(count)

                            for e in range(len(edist)):
                                edd = nltk.edit_distance(inputs.upper(), edist[e].upper())

                                if edd == 2:
                                    edist_update.append(edist[e])
                                    edist_count_update.append(edist_count[e])

                            if len(edist_update) != 0:
                                index = edist_count_update.index(max(edist_count_update, key=lambda c: int(c) if str(c).strip().isdigit() else -1))
                                min_candidates = edist_update[index]
                                edd_min_find = 1

                        if edd_min_find != 1:
                            min_candidates = False


                    if min_candidates != False:
                        return str(min_candidates)
                    else:
                        return str(target_text)

                else:  # added
                    return str(target_text)

            else:
                # multi-word / mixed-case label; keep as is
                return str(target_text)
        else:
            # text contains digits (e.g. '140D' stays '140D')
            return str(target_text)
    else:
        return str(target_text)
--------------------------------------------------------------------------------
/m4_post_ocr/post_ocr_main.py:
--------------------------------------------------------------------------------
import os
import argparse
import ast
import re
import pandas as pd
import numpy as np
import geojson
import json
from dotenv import load_dotenv
from shapely.geometry import Polygon
import psycopg2
import reverse_geocoder as rg
import pycountry_convert as pc
from pyproj import Transformer, transform, Proj
import sys
import elasticsearch
import elasticsearch.helpers
from post_ocr import lexical_search_query
import logging
import time

logging.basicConfig(level=logging.INFO)



def save_postocr_results(in_geojson_data, unique_map_text_li, es_conn, output_dir, in_geojson_filename):
    result_dict_postocr = dict()
    for map_text in set(unique_map_text_li):
        map_text_candidate = lexical_search_query(map_text, es_conn)
        result_dict_postocr[map_text] = map_text_candidate

    for feature_data in in_geojson_data["features"]:
        feature_data["properties"]["postocr_label"] = result_dict_postocr[str(feature_data["properties"]["text"]).lower()]

    with open(os.path.join(output_dir, in_geojson_filename.split("/")[-1]), 'w', encoding='utf8') as output_geojson:
        geojson.dump(in_geojson_data, output_geojson, ensure_ascii=False)

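# A minimal standalone usage sketch (assumes a local Elasticsearch node with
# the 'osm-voca' index already populated via logstash_postocr.conf):
#
#   es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
#   print(lexical_search_query('sacramenta', es))  # -> e.g. 'sacramento'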
def main(args):
    geojson_file = args.in_geojson_file
    output_dir = args.out_geojson_dir


    try:
        es = elasticsearch.Elasticsearch([{'host': "127.0.0.1", 'port': 9200}], timeout=1000)
        es_connected = es.ping()
    except:
        logging.warning('elasticsearch.ConnectionError.ElasticConnectionError while running %s', geojson_file.split("/")[-1])
        return
    if not es_connected:
        logging.warning('Error on elasticsearch connection while running %s', geojson_file.split("/")[-1])
        return
    es_logger = elasticsearch.logger
    es_logger.setLevel(elasticsearch.logging.WARNING)

    with open(geojson_file) as f:
        try:
            data = geojson.load(f)
        except json.decoder.JSONDecodeError:
            if os.path.getsize(geojson_file) == 0:
                # empty input; create an empty output file
                with open(os.path.join(output_dir, geojson_file.split("/")[-1]), 'w') as fp:
                    pass
            else:
                logging.info('JSONDecodeError %s', geojson_file)
            # sys.exit(1)
            return

    unique_map_text = []
    for feature_data in data['features']:
        unique_map_text.append(str(feature_data['properties']['text']).lower())

    # if postocr_only:
    save_postocr_results(data, unique_map_text, es, output_dir, geojson_file)
    logging.info('Done generating standalone post-ocr geojson for %s', geojson_file.split("/")[-1])
    # return


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_geojson_file', type=str, default='data/100_maps_geojson_abc_geocoord/',
                        help='input geojson')
    parser.add_argument('--out_geojson_dir', type=str, default='data/100_maps_geojson_abc_linked/',
                        help='output dir for converted geojson files')

    args = parser.parse_args()

    main(args)
--------------------------------------------------------------------------------
/m4_post_ocr/preprocess.py:
--------------------------------------------------------------------------------
from elasticsearch import Elasticsearch
import logging
import requests
import json

import http.client as http_client

import pandas as pd
import string
import emoji
import time

import glob
import os



def read_name():
    http_client.HTTPConnection.debuglevel = 1

    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)

    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.DEBUG)
    requests_log.propagate = True


    # popularity count
    headers = {
        'Content-Type': 'application/json',
    }

    json_body = '{"track_total_hits": true}'

    resp = requests.get(f'http://localhost:9200/osm/_search?&pretty=true',
                        data=json_body,
                        headers=headers)
    resp_json = json.loads(resp.text)
    total_value = resp_json["hits"]["total"]["value"]


    # initialize search_after pagination
    json_body_page = '{"track_total_hits": true, "size": 10000, "sort": [{"ogc_fid": {"order" : "desc" }}]}'
    resp_page = requests.post(f'http://localhost:9200/osm/_search?',
                              data=json_body_page,
                              headers=headers)
    resp_page_json = json.loads(resp_page.text)

    name_list = []

    st = []
    for h in range(len(resp_page_json["hits"]["hits"])):
        st = resp_page_json["hits"]["hits"][h]["sort"]
        text = resp_page_json["hits"]["hits"][h]["_source"]["name"]
        token_list = text.split(" ")
        for t in range(len(token_list)):
            name_list.append(token_list[t].lower())

    n_val = len(resp_page_json["hits"]["hits"])
    st_list = [st[0]]
    error_track = []

    # iterate over pages with search_after
    while n_val != total_value:

        try:  # osm_id.keyword
            json_body_page2 = '{"track_total_hits": true, "size": 10000, "sort": [{"ogc_fid": {"order" : "desc" }}], "search_after": [' + str(st[0]) + ']}'
            resp_page2 = requests.get(f'http://localhost:9200/osm/_search?',
                                      data=json_body_page2,
                                      headers=headers)
            resp_page_json2 = json.loads(resp_page2.text)

            for h in range(len(resp_page_json2["hits"]["hits"])):
                st = resp_page_json2["hits"]["hits"][h]["sort"]
                text = resp_page_json2["hits"]["hits"][h]["_source"]["name"]
                token_list = text.split(" ")
                for t in range(len(token_list)):
                    name_list.append(token_list[t].lower())

            n_val += len(resp_page_json2["hits"]["hits"])
            st_list.append(st[0])
            print(f'n_val: {n_val} done!')

        except Exception as e:
            print(e)  # Exception objects have no .message attribute in Python 3
            error_track.append(str(st[0]))

            # checkpoint progress on failure
            with open('error_id.txt', 'w') as fp:
                for item in error_track:
                    fp.write("%s\n" % item)
                print('Done')

            with open('name_mid.txt', 'w') as fp:
                for item in name_list:
                    fp.write("%s\n" % item)
                print('Done')

            with open('last_sort_id.txt', 'w') as fp:
                for item in st_list:
                    fp.write("%s\n" % item)
                print('Done')

    with open('name.txt', 'w') as fp:
        for item in name_list:
            fp.write("%s\n" % item)
        print('Done')

    with open('name_set.txt', 'w') as fp:
        name_set = list(set(name_list))
        for item in name_set:
            fp.write("%s\n" % item)
        print('Done')



def counting():
    input_txt = "name.txt"

    if os.path.exists(input_txt):

        punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
        start2 = time.time()
        set_lst2 = []
        with open(input_txt) as file:
            for item in file:
                name = emoji.replace_emoji(item.strip(), '')  # filter out emoji
                name = name.translate(str.maketrans('', '', string.punctuation))
                if len(name) > 0:
                    set_lst2.append(name.upper())

        end2 = time.time()
        start = time.time()

        dic = {}
        count = 0

        for word in set_lst2:
            count += 1
            if word in dic:
                dic[word] += 1
            else:
                dic[word] = 1

        end = time.time()

        print(end - start)
        print(end2 - start2)
        dff = pd.DataFrame.from_dict([dic]).T

        dff.reset_index(inplace=True)
        # after transposing, the value column is the integer 0, not the string '0'
        dff = dff.rename(columns={'index': 'name', 0: 'count'})
        dff.to_csv("out.csv", index=False)


if __name__ == '__main__':
    read_name()
    counting()
--------------------------------------------------------------------------------
/m5_geocoordinate_converter/convert_geojson_to_geocoord.py:
--------------------------------------------------------------------------------
import os
import argparse
import logging
import ast
import json

import pandas as pd
import numpy as np
import geojson

logging.basicConfig(level=logging.INFO)
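# Pipeline note: stitch_output.py negates y in the image-space polygons (for
# QGIS display). main() below stores the as-stitched coordinates under
# 'img_coordinates', flips y back to positive pixel rows, and then calls
# ogr2ogr to re-project the pixel polygons to EPSG:3857 using the map's
# ground control points.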
def main(args):
    geojson_file = args.in_geojson_file
    output_dir = args.out_geojson_dir

    sample_map_df = pd.read_csv(args.sample_map_path, dtype={'image_no': str})
    # normalize image_no, e.g. '0012001.1.jp2' or '0012001.jp2' -> '0012001'
    sample_map_df['image_no'] = sample_map_df['image_no'].str.replace('.1.jp2', '', regex=False).str.replace('.jp2', '', regex=False)
    sample_map_df['image_no'] = sample_map_df['image_no'].apply(lambda x: x[:-2] if x[-2:] == '.1' else x)

    geojson_filename_id = geojson_file.split(".")[0].split("/")[-1]

    if not os.path.isdir(os.path.join(output_dir, "tmp/")):
        os.makedirs(os.path.join(output_dir, "tmp/"))

    row = sample_map_df[sample_map_df['image_no'] == geojson_filename_id]
    if not row.empty:
        gcps = ast.literal_eval(row.iloc[0]['gcps'])
        gcp_str = ''
        for gcp in gcps:
            lng, lat = gcp['location']
            x, y = gcp['pixel']
            gcp_str += '-gcp ' + str(x) + ' ' + str(y) + ' ' + str(lng) + ' ' + str(lat) + ' '

        transform_method = row.iloc[0]['transformation_method']
        assert transform_method in ['affine', 'polynomial', 'tps']

        with open(geojson_file) as img_geojson:
            try:
                img_data = geojson.load(img_geojson)
            except json.decoder.JSONDecodeError:
                if os.stat(geojson_file).st_size == 0:
                    with open(os.path.join(output_dir, geojson_filename_id + '.geojson'), 'w') as fp:
                        pass
                    logging.info('Done generating empty geocoord geojson for %s', geojson_file)
                else:
                    logging.info('JSONDecodeError %s', geojson_file)
                return

        for img_feature in img_data['features']:
            arr = np.array(img_feature['geometry']['coordinates'])
            img_feature['properties']['img_coordinates'] = np.array(arr).reshape(-1, 2).tolist()

            # undo the y negation applied during stitching
            arr[:, :, 1] *= -1
            img_feature['geometry']['coordinates'] = arr.tolist()

        with open(os.path.join(os.path.join(output_dir, "tmp/"), geojson_filename_id + '.geojson'), 'w', encoding='utf8') as geocoord_geojson:
            geojson.dump(img_data, geocoord_geojson, ensure_ascii=False)

        input_path = '"' + output_dir + "/tmp/" + geojson_filename_id + '.geojson"'
        output_path = '"' + output_dir + "/" + geojson_filename_id + '.geojson"'

        if transform_method == 'affine':
            geocoord_convert_command = 'ogr2ogr -f "GeoJSON" ' + output_path + " " + input_path + ' -order 1 -s_srs epsg:4326 -t_srs epsg:3857 -skipfailures ' + gcp_str

        elif transform_method == 'polynomial':
            geocoord_convert_command = 'ogr2ogr -f "GeoJSON" ' + output_path + " " + input_path + ' -order 2 -s_srs epsg:4326 -t_srs epsg:3857 -skipfailures ' + gcp_str

        elif transform_method == 'tps':
            geocoord_convert_command = 'ogr2ogr -f "GeoJSON" ' + output_path + " " + input_path + ' -tps -s_srs epsg:4326 -t_srs epsg:3857 -skipfailures ' + gcp_str

        else:
            raise NotImplementedError

        ret_value = os.system(geocoord_convert_command)
        if os.path.exists(os.path.join(os.path.join(output_dir, "tmp/"), geojson_filename_id + '.geojson')):
            os.remove(os.path.join(os.path.join(output_dir, "tmp/"), geojson_filename_id + '.geojson'))

        if ret_value != 0:
            logging.info('Failed generating geocoord geojson for %s', geojson_file)
        else:
            logging.info('Done generating geocoord geojson for %s', geojson_file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
                        help='path to sample map csv, which contains gcps info')
    parser.add_argument('--in_geojson_file', type=str,
                        help='input geojson file; results of the M3 stitching step')
    parser.add_argument('--out_geojson_dir', type=str, default='data/100_maps_geojson_abc_geocoord/',
                        help='output dir for converted geojson files')

    args = parser.parse_args()

    main(args)
--------------------------------------------------------------------------------
/m6_entity_linker/create_elasticsearch_index.py:
--------------------------------------------------------------------------------
import argparse
import logging

import pandas as pd

import elasticsearch
from elasticsearch import helpers

logging.basicConfig(level=logging.INFO)

def main(args):
    # elasticsearch connection
    try:
        es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}], timeout=1000)
        es_connected = es.ping()
    except:
        logging.warning('elasticsearch.ConnectionError.ElasticConnectionError while building the osm-linker csv')
        return
    if not es_connected:
        logging.warning('Error on elasticsearch connection while building the osm-linker csv')
        return
    es_logger = elasticsearch.logger
    es_logger.setLevel(elasticsearch.logging.WARNING)

    df = pd.read_csv(args.in_csv)

    for index, row in df.iterrows():
        if index % 1000 == 0: print(index, "processed ...")

        es_query = {"query": {
            "bool": {
                "must": [
                    {
                        "match": {'name': str(row['name']).lower()}
                    }
                ]
            }
        }}

        try:
            osm_count = es.count(index="osm", body=es_query)["count"]
        except elasticsearch.ElasticsearchException as es_error:
            logging.warning('ElasticsearchException while counting matches for %s', row['name'])
            continue

        # skip words with more than 10000 matched cases in OSM
        if osm_count > 10000:
            continue

        try:
            es_results = elasticsearch.helpers.scan(es, index="osm", query=es_query)
        except elasticsearch.ElasticsearchException as es_error:
            logging.warning('ElasticsearchException while scanning matches for %s', row['name'])
            continue

        es_results = [(hit["_source"]['source_table'], hit["_source"]['osm_id']) for hit in es_results]
        if len(es_results) == 0:
            # no elasticsearch results for this word
            continue

        df.loc[index, 'source_table_osm_id'] = str(es_results)

    df = df.dropna()
    df.to_csv(args.out_csv, index=False)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_csv', type=str, default='out.csv', help='input csv')
    parser.add_argument('--out_csv', type=str, default='./m6_entity_linker/osm_linker.csv', help='output csv')
    args = parser.parse_args()

    main(args)
--------------------------------------------------------------------------------
/m6_entity_linker/create_spatial_index_postgres.py:
--------------------------------------------------------------------------------
import os

from dotenv import load_dotenv
import psycopg2


load_dotenv()
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_USERNAME = os.getenv("DB_USERNAME")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_NAME = os.getenv("DB_NAME")

conn = psycopg2.connect(database=DB_NAME, host=DB_HOST, user=DB_USERNAME, password=DB_PASSWORD, port=DB_PORT)
cur = conn.cursor()
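# For each continent schema and table below, two indexes are created; e.g. for
# africa.points the generated statements are:
#   CREATE INDEX africa_points_index ON africa.points USING gist (wkb_geometry);
#   CREATE INDEX africa_points_osm_index ON africa.points (osm_id);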
continents = ['africa', 'asia', 'central_america', 'north_america', 'south_america', 'antarctica', 'australia_oceania', 'europe']
tables = ['points', 'lines', 'multilinestrings', 'multipolygons', 'other_relations']

for continent in continents:
    for table in tables:
        name = continent + "." + table
        cur.execute(f'''CREATE INDEX {name.replace(".","_")+"_index"} ON {name} USING gist (wkb_geometry);''')
        cur.execute(f'''CREATE INDEX {name.replace(".","_")+"_osm_index"} ON {name} (osm_id);''')
        print(name, " creating index...")

conn.commit()
cur.close()
conn.close()
--------------------------------------------------------------------------------
/m6_entity_linker/entity_linking.py:
--------------------------------------------------------------------------------
import os
import argparse
import ast
import logging
import time

import pandas as pd
import numpy as np
import geojson
import json

from shapely.ops import transform
from shapely.geometry import Polygon
import pyproj

import elasticsearch

from dotenv import load_dotenv
import psycopg2
from postgres_logger import LinkerLoggingConnection

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main(args):
    input_dir = args.in_geojson_dir
    output_dir = args.out_geojson_dir

    # elasticsearch connection
    try:
        es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}], timeout=1000)
        es_connected = es.ping()
    except:
        logging.warning('elasticsearch.ConnectionError.ElasticConnectionError')
        return
    if not es_connected:
        logging.warning('Error on elasticsearch connection')
        return
    es_logger = elasticsearch.logger
    es_logger.setLevel(elasticsearch.logging.WARNING)

    # postgres connection
    load_dotenv()
    DB_HOST = os.getenv("DB_HOST")
    DB_PORT = os.getenv("DB_PORT")
    DB_USERNAME = os.getenv("DB_USERNAME")
    DB_PASSWORD = os.getenv("DB_PASSWORD")
    DB_NAME = os.getenv("DB_NAME")

    try:
        conn = psycopg2.connect(host=DB_HOST, database=DB_NAME, user=DB_USERNAME, password=DB_PASSWORD, port=DB_PORT, connection_factory=LinkerLoggingConnection)
    except Exception as e:
        logging.warning('Error on psycopg2 connection: %s', e)
        return

    sample_map_df = pd.read_csv(args.sample_map_path, dtype={'image_no': str})
    sample_map_df['image_no'] = sample_map_df['image_no'].str.replace('.1.jp2', '', regex=False).str.replace('.jp2', '', regex=False)
    sample_map_df['image_no'] = sample_map_df['image_no'].apply(lambda x: x[:-2] if x[-2:] == '.1' else x)

    conn.initialize(logger)
    conn.autocommit = True

    with conn.cursor() as cur:
        for index, record in sample_map_df.iterrows():
            input_geojson_file = os.path.join(input_dir, record.image_no + ".geojson")

            if not os.path.exists(input_geojson_file):
                logging.warning('PostOCR output does not exist %s', record.image_no + ".geojson")
                continue

            if os.path.exists(os.path.join(output_dir, input_geojson_file.split("/")[-1])):
                logging.info('EntityLinker output already exists %s', record.image_no + ".geojson")
                continue

            with open(input_geojson_file) as f:
                try:
                    data = geojson.load(f)
                except json.decoder.JSONDecodeError:
                    if os.path.getsize(input_geojson_file) == 0:
                        with open(os.path.join(output_dir, input_geojson_file.split("/")[-1]), 'w') as fp:
                            pass
                        continue
                    else:
                        logging.info('JSONDecodeError %s', input_geojson_file)
                        continue
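            # Each elasticsearch hit stores a stringified list of
            # (source_table, osm_id) pairs built by create_elasticsearch_index.py,
            # e.g. "[('north_america.points', '12345'), ('europe.lines', '67890')]";
            # the candidates are then verified spatially against PostGIS below.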
            for feature_data in data['features']:
                map_text = str(feature_data['properties']['postocr_label'])

                # skip null geometry
                if feature_data['geometry'] is None:
                    feature_data["properties"]["osm_id"] = []
                    continue

                # skip text of 3 characters or fewer
                if len(map_text) <= 3:
                    feature_data["properties"]["osm_id"] = []
                    continue

                pts = np.array(feature_data['geometry']['coordinates']).reshape(-1, 2)
                map_polygon = Polygon(pts)

                es_query = {
                    "bool": {
                        "must": [
                            {
                                "match": {'name': map_text.lower()}
                            }
                        ]
                    }
                }

                try:
                    es_results = es.search(index="osm-linker", query=es_query)
                except elasticsearch.ElasticsearchException as es_error:
                    logging.warning('ElasticsearchException while running %s', input_geojson_file.split("/")[-1])
                    continue

                if es_results['hits']['total']['value'] == 0:
                    feature_data["properties"]["osm_id"] = []
                    continue

                es_results = [ast.literal_eval(hit["_source"]['source_table_osm_id']) for hit in es_results['hits']['hits']][0]
                output_osm_ids = []
                source_tables = set([table for table, _ in es_results if "other_relations" not in table])

                for source_table in source_tables:
                    sql = ""
                    osm_ids = [osm_id for table, osm_id in es_results if table == source_table]

                    if "points" in source_table:
                        sql = f"""SELECT osm_id
                                  FROM {source_table}
                                  WHERE ST_CONTAINS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), wkb_geometry)
                                  AND osm_id = ANY (%s)
                               """

                    elif "line" in source_table:
                        sql = f"""SELECT osm_id
                                  FROM {source_table}
                                  WHERE ST_INTERSECTS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), wkb_geometry)
                                  AND osm_id = ANY (%s)
                               """

                    elif "polygon" in source_table:
                        sql = f"""SELECT osm_id
                                  FROM {source_table}
                                  WHERE ST_INTERSECTS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), ST_MakeValid(wkb_geometry, 'method=structure'))
                                  AND osm_id = ANY (%s)
                               """

                    try:
                        cur.execute(sql, (osm_ids,))
                    except Exception as e:
                        logging.warning('Error occurred while executing sql for %s: %s', input_geojson_file.split("/")[-1], e)
                        if "TopologyException" in repr(e):
                            continue
                        else:
                            return

                    sql_result = cur.fetchall()
                    if len(sql_result) != 0:
                        output_osm_ids.extend([x[0] for x in sql_result])

                feature_data["properties"]["osm_id"] = output_osm_ids

            with open(os.path.join(output_dir, input_geojson_file.split("/")[-1]), 'w', encoding='utf8') as output_geojson:
                geojson.dump(data, output_geojson, ensure_ascii=False)
            logging.info('Done generating geojson for %s', input_geojson_file.split("/")[-1])

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
                        help='path to sample map csv, which contains gcps info')
    parser.add_argument('--in_geojson_dir', type=str, default='data/100_maps_geojson_abc_geocoord/',
                        help='input geojson dir')
    parser.add_argument('--out_geojson_dir', type=str, default='data/100_maps_geojson_abc_linked/',
                        help='output dir for converted geojson files')
    args = parser.parse_args()
    main(args)
--------------------------------------------------------------------------------
/m6_entity_linker/logstash_osm_linker.conf:
--------------------------------------------------------------------------------
input {
  file {
    # NOTE: logstash's file input expects an absolute path
    path => "./m6_entity_linker/osm_linker.csv"
    start_position => beginning
  }
}
filter {
  csv {
    separator => ","
    columns => [
      "name",
      "0",
      "source_table_osm_id"
    ]
  }
}
output {
  elasticsearch {
    hosts => "localhost:9200"
    index => "osm-linker"
  }
}
--------------------------------------------------------------------------------
/m6_entity_linker/logstash_postgres_world.conf:
--------------------------------------------------------------------------------
input {
  jdbc {
    jdbc_connection_string => "jdbc:postgresql://localhost:5432/osm"
    jdbc_user => ""
    jdbc_password => ""
    jdbc_paging_enabled => true
    jdbc_fetch_size => 100000
    jdbc_driver_library => "/usr/share/logstash/logstash-core/lib/jars/postgresql-42.6.0.jar"
    jdbc_driver_class => "org.postgresql.Driver"
    statement => "SELECT ogc_fid, osm_id, name, source_table FROM entire_continents ORDER BY ogc_fid"
  }
}
output {
  elasticsearch {
    hosts => "localhost:9200"
    index => "osm"
    document_id => "%{ogc_fid}"
    doc_as_upsert => true
  }
}
--------------------------------------------------------------------------------
/m6_entity_linker/postgres_logger.py:
--------------------------------------------------------------------------------
import time

import psycopg2
from psycopg2.extras import LoggingConnection, LoggingCursor

class LinkerLoggingCursor(LoggingCursor):
    def execute(self, query, vars=None):
        self.timestamp = time.time()
        return super(LinkerLoggingCursor, self).execute(query, vars)

    def callproc(self, procname, vars=None):
        self.timestamp = time.time()
        return super(LinkerLoggingCursor, self).callproc(procname, vars)

class LinkerLoggingConnection(LoggingConnection):
    def filter(self, msg, curs):
        return msg.decode(psycopg2.extensions.encodings[self.encoding], 'replace') + " %d ms" % int((time.time() - curs.timestamp) * 1000)

    def cursor(self, *args, **kwargs):
        kwargs.setdefault('cursor_factory', LinkerLoggingCursor)
        return LoggingConnection.cursor(self, *args, **kwargs)
--------------------------------------------------------------------------------
/m6_entity_linker/upload_osm_to_postgres_all_continents.py:
--------------------------------------------------------------------------------
import os

from dotenv import load_dotenv
import psycopg2


load_dotenv()
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_USERNAME = os.getenv("DB_USERNAME")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_NAME = os.getenv("DB_NAME")

conn = psycopg2.connect(database=DB_NAME, host=DB_HOST, user=DB_USERNAME, password=DB_PASSWORD, port=DB_PORT)
cur = conn.cursor()
cur.execute('''CREATE TABLE entire_continents (
    ogc_fid SERIAL PRIMARY KEY,
    osm_id character varying,
    name character varying,
    source_table character varying
);''')
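# Example of one generated statement (for africa.points):
#   INSERT INTO entire_continents(osm_id, name, source_table)
#   SELECT osm_id, name, 'africa.points' FROM africa.points
#   WHERE name IS NOT NULL AND osm_id IS NOT NULL;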
continents = ['africa', 'asia', 'central_america', 'north_america', 'south_america', 'antarctica', 'australia_oceania', 'europe']
tables = ['points', 'lines', 'multilinestrings', 'multipolygons', 'other_relations']

for continent in continents:
    for table in tables:
        name = continent + "." + table
        cur.execute(f'''INSERT INTO entire_continents(osm_id, name, source_table)
                        SELECT osm_id, name, '{name}' FROM {name}
                        WHERE name IS NOT NULL AND osm_id IS NOT NULL;''')
        print(name, " inserting into entire_continents...")

conn.commit()
cur.close()
conn.close()
--------------------------------------------------------------------------------
/m6_entity_linker/upload_osm_to_postgres_ogr2ogr.py:
--------------------------------------------------------------------------------
from dotenv import load_dotenv
import subprocess
import logging
import os

import psycopg2

continents = ['africa', 'asia', 'central_america', 'north_america', 'south_america', 'antarctica', 'australia_oceania', 'europe']

load_dotenv()
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_USERNAME = os.getenv("DB_USERNAME")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_NAME = os.getenv("DB_NAME")


try:
    conn = psycopg2.connect(host=DB_HOST, database=DB_NAME, user=DB_USERNAME, password=DB_PASSWORD)
except Exception as e:
    logging.warning('Error on psycopg2 connection: %s', e)

cur = conn.cursor()

for continent in continents:
    cur.execute(f'''CREATE SCHEMA {continent};''')
    print(continent, " creating schema...")

conn.commit()
cur.close()
conn.close()

for continent in continents:
    cmd = f'''ogr2ogr -f PostgreSQL PG:"dbname='{DB_NAME}' host='{DB_HOST}' port='{DB_PORT}' user='{DB_USERNAME}' password='{DB_PASSWORD}'" {continent.replace('_','-')}-latest.osm.pbf -nlt PROMOTE_TO_MULTI -lco SCHEMA={continent}'''
    print("--", continent, "--")
    print(cmd)
    subprocess.call(cmd, shell=True)
--------------------------------------------------------------------------------
/m_sanborn/s1_geocoding.py:
--------------------------------------------------------------------------------
import os
import argparse
import geojson
import geocoder
import json
import time
import pdb


def arcgis_geocoding(place_name, maxRows=5):
    try:
        response = geocoder.arcgis(place_name, maxRows=maxRows)
        return response.json
    except Exception as e:
        print(e)
        return -1


def google_geocoding(place_name, api_key=None, maxRows=5):
    try:
        response = geocoder.google(place_name, key=api_key, maxRows=maxRows)
        return response.json
    except Exception as e:
        print(e)
        return -1

def osm_geocoding(place_name, maxRows=5):
    try:
        response = geocoder.osm(place_name, maxRows=maxRows)
        return response.json
    except Exception as e:
        print(e)
        return -1


def geonames_geocoding(place_name, user_name=None, maxRows=5):
    try:
        response = geocoder.geonames(place_name, key=user_name, maxRows=maxRows)
        # geonames enforces an hourly limit of 1000 credits
        time.sleep(4)
        return response.json
    except Exception as e:
        print(e)
        return -1
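# Quick sanity check for the wrappers above (a sketch; requires network access
# and the `geocoder` package, and the output shape depends on the provider):
#
#   >>> arcgis_geocoding('Los Angeles City Hall', maxRows=1)
#   {'address': '...', 'lat': 34.05..., 'lng': -118.24..., ...}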
open(input_map_geojson_path, 'r') as f: 57 | data = geojson.load(f) 58 | 59 | map_name = os.path.basename(input_map_geojson_path).split('.')[0] 60 | output_folder = os.path.join(output_folder, geocoder_option) 61 | 62 | if not os.path.isdir(output_folder): 63 | os.makedirs(output_folder) 64 | 65 | output_path = os.path.join(output_folder, map_name) + '.json' 66 | 67 | with open(output_path, 'w') as f: 68 | pass # create/truncate the output file before results are appended below 69 | 70 | features = data['features'] 71 | for feature in features: # iterate through all the detected text labels 72 | geometry = feature['geometry'] 73 | text = feature['properties']['text'] 74 | score = feature['properties']['score'] 75 | 76 | # suffix = ', Los Angeles' 77 | text = str(text) + (suffix or '') # guard against suffix=None (the --suffix default) 78 | 79 | print(text) 80 | 81 | if geocoder_option == 'arcgis': 82 | results = arcgis_geocoding(text, maxRows = max_results) 83 | elif geocoder_option == 'google': 84 | results = google_geocoding(text, api_key = api_key, maxRows = max_results) 85 | elif geocoder_option == 'geonames': 86 | results = geonames_geocoding(text, user_name = user_name, maxRows = max_results) 87 | elif geocoder_option == 'osm': 88 | results = osm_geocoding(text, maxRows = max_results) 89 | else: 90 | raise NotImplementedError 91 | 92 | if results == -1: 93 | # geocoder could not find a match 94 | pass 95 | else: 96 | # save results 97 | with open(output_path, 'a') as f: 98 | json.dump({'text':text, 'score':score, 'geometry': geometry, 'geocoding':results}, f) 99 | f.write('\n') 100 | 101 | # pdb.set_trace() 102 | 103 | 104 | def main(): 105 | parser = argparse.ArgumentParser() 106 | 107 | parser.add_argument('--output_folder', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geocoding/') 108 | parser.add_argument('--input_map_geojson_path', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geojson_testr/service-gmd-gmd436m-g4364m-g4364lm-g4364lm_g00656189401-00656_01_1894-0001l.geojson') 109 | parser.add_argument('--api_key', type=str, default=None, help='Specify API key if needed') 110 | parser.add_argument('--user_name', type=str, default=None, help='Specify user name if needed') 111 | 112 | parser.add_argument('--suffix', type=str, default=None, help='placename suffix (e.g. 
city name)') 113 | 114 | parser.add_argument('--max_results', type=int, default=5, help='max number of results returned by geocoder') 115 | 116 | parser.add_argument('--geocoder_option', type=str, default='arcgis', 117 | choices=['arcgis', 'google','geonames','osm'], 118 | help='Select geocoder option from ["arcgis","google","geonames","osm"]') # select geocoder 119 | 120 | 121 | args = parser.parse_args() 122 | print('\n') 123 | print(args) 124 | print('\n') 125 | 126 | if not os.path.isdir(args.output_folder): 127 | os.makedirs(args.output_folder) 128 | 129 | geocoding(args) 130 | 131 | 132 | if __name__ == '__main__': 133 | 134 | main() 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /m_sanborn/s2_clustering.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from sklearn.cluster import DBSCAN 5 | from matplotlib import pyplot as plt 6 | import geopandas as gpd 7 | import pandas as pd 8 | from bs4 import BeautifulSoup 9 | from mpl_toolkits.basemap import Basemap 10 | from pyproj import Proj, transform 11 | 12 | from shapely.geometry import Point 13 | from shapely.geometry.polygon import Polygon 14 | import numpy as np 15 | from shapely.geometry import MultiPoint 16 | from geopy.distance import great_circle 17 | 18 | 19 | county_index_dict = {'Cuyahoga County (OH)': 193, 20 | 'Fulton County (GA)': 73, 21 | 'Kern County (CA)': 2872, 22 | 'Lancaster County (NE)': 1629, 23 | 'Los Angeles County (CA)': 44, 24 | 'Mexico': -1, 25 | 'Nevada County (CA)': 46, 26 | 'New Orleans (LA)': -1, 27 | 'Pima County (AZ)': 2797, 28 | 'Placer County (CA)': 1273, 29 | 'Providence County (RI)\xa0': 1124, 30 | 'Saint Louis (MO)': -1, 31 | 'San Francisco County (CA)': 1261, 32 | 'San Joaquin County (CA)': 1213, 33 | 'Santa Clara (CA)': 48, 34 | 'Santa Cruz (CA)': 2386, 35 | 'Suffolk County (MA)': 272, 36 | 'Tulsa County (OK)': 526, 37 | 'Washington County (AK)': -1, 38 | 'Washington DC': -1} 39 | 40 | def get_centermost_point(cluster): 41 | centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y) 42 | centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m) 43 | return tuple(centermost_point) 44 | 45 | def clustering_func(lat_list, lng_list): 46 | X = [[a,b] for a,b in zip(lat_list, lng_list)] 47 | coords = np.array(X) 48 | 49 | # https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/ 50 | kms_per_radian = 6371.0088 51 | epsilon = 1.5 / kms_per_radian 52 | db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords)) 53 | cluster_labels = db.labels_ 54 | num_clusters = len(set(cluster_labels)) 55 | clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)]) 56 | 57 | centermost_points = get_centermost_point(clusters[0]) # takes the first DBSCAN cluster as the representative one 58 | return centermost_points 59 | 60 | def plot_points(lat_list, lng_list, target_lat_list=None, target_lng_list = None, pred_lat=None, pred_lng = None, title = None): 61 | 62 | plt.figure(figsize=(10,6)) 63 | plt.title(title) 64 | 65 | plt.scatter(lng_list, lat_list, marker='o', c = 'violet', alpha=0.5) 66 | if pred_lat is not None and pred_lng is not None: 67 | plt.scatter(pred_lng, pred_lat, marker='o', c = 'red') 68 | 69 | if target_lat_list is not None and target_lng_list is not None: 70 | plt.scatter(target_lng_list, target_lat_list, 10, c = 'blue') 71 | plt.show() 72 | 73 | def 
plot_points_basemap(lat_list, lng_list, target_lat_list=None, target_lng_list = None, pred_lat=None, pred_lng = None, title = None): 74 | 75 | plt.figure(figsize=(10,6)) 76 | plt.title(title) 77 | 78 | if len(lat_list) >0 and len(lng_list) > 0: 79 | anchor_lat, anchor_lng = lat_list[0], lng_list[0] 80 | elif target_lat_list is not None: 81 | anchor_lat, anchor_lng = target_lat_list[0], target_lng_list[0] 82 | else: 83 | anchor_lat, anchor_lng = 45, -100 84 | 85 | m = Basemap(projection='lcc', resolution=None, 86 | width=8E4, height=8E4, 87 | lat_0=anchor_lat, lon_0=anchor_lng) 88 | m.etopo(scale=0.5, alpha=0.5) 89 | # m.arcgisimage(service='ESRI_Imagery_World_2D', xpixels = 2000, verbose= True) 90 | # m.arcgisimage(service='ESRI_Imagery_World_2D',scale=0.5, alpha=0.5) 91 | 92 | 93 | lng_list, lat_list = m(lng_list, lat_list) # transform coordinates 94 | plt.scatter(lng_list, lat_list, marker='o', c = 'violet', alpha=0.5) 95 | 96 | 97 | if target_lat_list is not None and target_lng_list is not None: 98 | target_lng_list, target_lat_list = m(target_lng_list, target_lat_list) 99 | plt.scatter(target_lng_list, target_lat_list, marker='o', c = 'blue',edgecolor='blue') 100 | 101 | if pred_lat is not None and pred_lng is not None: 102 | pred_lng, pred_lat = m(pred_lng, pred_lat) 103 | plt.scatter(pred_lng, pred_lat, marker='o', c = 'red', edgecolor='black') 104 | 105 | plt.show() 106 | 107 | def plotting_func(loc_sanborn_dir, pred_dict, lat_lng_dict, dataset_name, geocoding_name): 108 | inProj, outProj = Proj(init='epsg:3857'), Proj(init='epsg:4326') # web-mercator GCPs -> lat/lng for transform() below 109 | for map_name, pred in pred_dict.items(): 110 | 111 | title = dataset_name + '-' + geocoding_name + '-' + map_name 112 | lat_list = lat_lng_dict[map_name]['lat_list'] 113 | lng_list = lat_lng_dict[map_name]['lng_list'] 114 | 115 | if dataset_name == 'LoC_sanborn': 116 | xml_path = os.path.join(loc_sanborn_dir,map_name + '.tif.aux.xml') 117 | try: 118 | with open(xml_path) as fp: 119 | soup = BeautifulSoup(fp) 120 | 121 | target_gcp_list = soup.findAll("metadata")[1].targetgcps.findAll("double") 122 | except Exception as e: 123 | print(xml_path) 124 | continue 125 | 126 | xy_list = [] 127 | for target_gcp in target_gcp_list: 128 | xy_list.append(float(target_gcp.contents[0])) 129 | 130 | x_list = xy_list[0::2] 131 | y_list = xy_list[1::2] 132 | 133 | lng2_list, lat2_list = [],[] 134 | for x1,y1 in zip(x_list, y_list): 135 | x2,y2 = transform(inProj,outProj,x1,y1) 136 | #print (x2,y2) 137 | lng2_list.append(x2) 138 | lat2_list.append(y2) 139 | 140 | plot_points(lat_list, lng_list, lat2_list, lng2_list, pred_lat = pred[0], pred_lng = pred[1], title=title) 141 | else: 142 | plot_points(lat_list, lng_list,pred_lat = pred[0], pred_lng = pred[1], title=title) 143 | 144 | 145 | def clustering(args): 146 | dataset_name = args.dataset_name 147 | geocoding_name = args.geocoding_name 148 | remove_duplicate_location = args.remove_duplicate_location 149 | visualize = args.visualize 150 | 151 | sanborn_output_dir = '/data2/sanborn_maps_output' 152 | 153 | input_dir=os.path.join(sanborn_output_dir, dataset_name, 'geocoding_suffix_testr', geocoding_name) 154 | if remove_duplicate_location: 155 | output_dir = os.path.join(sanborn_output_dir, dataset_name, 'clustering_testr_removeduplicate', geocoding_name) 156 | else: 157 | output_dir = os.path.join(sanborn_output_dir, dataset_name, 'clustering_testr', geocoding_name) 158 | 159 | county_boundary_path = '/home/zekun/Sanborn/cb_2018_us_county_500k/cb_2018_us_county_500k.shp' 160 | 161 | if not 
os.path.isdir(output_dir): 162 | os.makedirs(output_dir) 163 | 164 | inProj = Proj(init='epsg:3857') 165 | outProj = Proj(init='epsg:4326') 166 | 167 | county_boundary_df = gpd.read_file(county_boundary_path) 168 | 169 | if dataset_name == 'LoC_sanborn': 170 | loc_sanborn_dir = '/data2/sanborn_maps/Sanborn100_Georef/' # for comparing with GT 171 | metadata_tsv_path = '/home/zekun/Sanborn/Sheet_List.tsv' 172 | meta_df = pd.read_csv(metadata_tsv_path, sep='\t') 173 | 174 | file_list = os.listdir(input_dir) 175 | 176 | pred_dict = dict() 177 | lat_lng_dict = dict() 178 | for file_path in file_list: 179 | 180 | map_name = os.path.basename(file_path).split('.')[0] 181 | if dataset_name == 'LoC_sanborn': 182 | county_name = meta_df[meta_df['filename'] == map_name]['County'].values[0] 183 | elif dataset_name in ('LA_sanborn', 'two_more'): # both datasets use the LA county boundary 184 | county_name = 'Los Angeles County (CA)' 185 | else: 186 | raise NotImplementedError 187 | 188 | index = county_index_dict[county_name] 189 | if index >= 0: 190 | poly_geometry = county_boundary_df.iloc[index].geometry 191 | 192 | with open(os.path.join(input_dir,file_path), 'r') as f: 193 | data = f.readlines() 194 | 195 | lat_list = [] 196 | lng_list = [] 197 | for line in data: 198 | 199 | line_dict = json.loads(line) 200 | geocoding_dict = line_dict['geocoding'] 201 | text = line_dict['text'] 202 | score = line_dict['score'] 203 | geometry = line_dict['geometry'] 204 | 205 | if geocoding_dict is None: 206 | continue # if no geolocation returned by geocoder, then skip 207 | 208 | if 'lat' not in geocoding_dict or 'lng' not in geocoding_dict: 209 | #print(geocoding_dict) 210 | continue 211 | 212 | lat = float(geocoding_dict['lat']) 213 | lng = float(geocoding_dict['lng']) 214 | 215 | point = Point(lng, lat) 216 | 217 | if index >= 0: 218 | if point.within(poly_geometry): # geocoding point within county boundary 219 | lat_list.append(lat) 220 | lng_list.append(lng) 221 | else: 222 | pass 223 | else: # cluster based on all results 224 | lat_list.append(lat) 225 | lng_list.append(lng) 226 | 227 | if remove_duplicate_location: 228 | unique_points = set(zip(lat_list, lng_list)) # deduplicate (lat, lng) pairs together so coordinates stay aligned 229 | lat_list, lng_list = [p[0] for p in unique_points], [p[1] for p in unique_points] 230 | 231 | if len(lat_list) >0 and len(lng_list) > 0: 232 | pred = clustering_func(lat_list, lng_list) 233 | # print(pred) 234 | else: 235 | print('No data to cluster') 236 | continue # no usable geocoding results for this map, so no center can be predicted 237 | print(map_name, pred) 238 | pred_dict[map_name] = pred 239 | lat_lng_dict[map_name]={'lat_list':lat_list, 'lng_list':lng_list} 240 | 241 | if visualize: 242 | plotting_func(loc_sanborn_dir = loc_sanborn_dir, pred_dict = pred_dict, lat_lng_dict = lat_lng_dict, 243 | dataset_name = dataset_name, geocoding_name = geocoding_name) 244 | 245 | with open(os.path.join(output_dir, 'pred_center.json'),'w') as f: 246 | json.dump(pred_dict, f) 247 | 248 | 249 | def main(): 250 | parser = argparse.ArgumentParser() 251 | 252 | parser.add_argument('--dataset_name', type=str, default=None, 253 | choices=['LA_sanborn', 'LoC_sanborn',], 254 | help='dataset name, same as expt_name') 255 | parser.add_argument('--geocoding_name', type=str, default=None, 256 | choices=['google','arcgis','geonames','osm'], 257 | help='geocoder name') 258 | parser.add_argument('--visualize', default = False, action = 'store_true') # Enable this when in notebook 259 | parser.add_argument('--remove_duplicate_location', default=False, action='store_true') # whether to remove duplicate geolocations before clustering 260 | 261 | # parser.add_argument('--output_folder', type=str, 
default='/data2/sanborn_maps_output/LA_sanborn/geocoding/') 262 | # parser.add_argument('--input_map_geojson_path', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geojson_testr/service-gmd-gmd436m-g4364m-g4364lm-g4364lm_g00656189401-00656_01_1894-0001l.geojson') 263 | 264 | 265 | args = parser.parse_args() 266 | print('\n') 267 | print(args) 268 | print('\n') 269 | 270 | clustering(args) 271 | 272 | 273 | if __name__ == '__main__': 274 | 275 | main() 276 | -------------------------------------------------------------------------------- /m_sanborn/s3_gen_geojson.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knowledge-computing/mapkurator-system/5b765d99c4898ce07654d904b6f3b608b9e76189/m_sanborn/s3_gen_geojson.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knowledge-computing/mapkurator-system/5b765d99c4898ce07654d904b6f3b608b9e76189/requirements.txt -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import argparse 4 | import time 5 | import logging 6 | import pandas as pd 7 | import datetime 8 | from PIL import Image 9 | from utils import get_img_path_from_external_id, get_img_path_from_external_id_and_image_no,run_pipeline 10 | 11 | import subprocess 12 | 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | Image.MAX_IMAGE_PIXELS=None # allow reading huge images 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | 20 | parser.add_argument('--map_kurator_system_dir', type=str, default='/home/maplord/rumsey/mapkurator-system/') 21 | parser.add_argument('--text_spotting_model_dir', type=str, default='/home/maplord/rumsey/TESTR/') 22 | 23 | parser.add_argument('--sample_map_csv_path', type=str, default=None) 24 | 25 | parser.add_argument('--output_folder', type=str, default='/data2/rumsey_output') # Original: /data2/rumsey_output 26 | parser.add_argument('--expt_name', type=str, default='1000_maps') # output prefix 27 | 28 | parser.add_argument('--module_get_dimension', default=False, action='store_true') 29 | parser.add_argument('--module_gen_geotiff', default=False, action='store_true') 30 | parser.add_argument('--module_cropping', default=False, action='store_true') 31 | parser.add_argument('--module_text_spotting', default=False, action='store_true') 32 | parser.add_argument('--module_img_geojson', default=False, action='store_true') 33 | parser.add_argument('--module_geocoord_geojson', default=False, action='store_true') 34 | parser.add_argument('--module_post_ocr_entity_linking', default=False, action='store_true') 35 | parser.add_argument('--module_post_ocr_only', default=False, action='store_true') 36 | parser.add_argument('--module_post_ocr', default=False, action='store_true') 37 | 38 | parser.add_argument('--spotter_model', type=str, default='spotter-v2', choices=['testr', 'spotter-v2', "palette"], 39 | help='Select text spotting model option from ["testr", "spotter-v2", "palette"]') # select text spotting model 40 | parser.add_argument('--spotter_config', type=str, default='/home/maplord/rumsey/TESTR/configs/TESTR/SynMap/SynMap_Polygon.yaml', 41 | help='Path to the config file for text spotting model') 42 | parser.add_argument('--spotter_expt_name', type=str, 
default='exp', 43 | help='Name of spotter experiment; if empty, the config file name is used') 44 | 45 | # Running spotter-testr 46 | # python run.py --text_spotting_model_dir /home/maplord/rumsey/spotter-testr/TESTR/ 47 | # --sample_map_csv_path /home/maplord/maplist_csv/luna_omo_splits/luna_omo_metadata_56628_20220724_part1.csv 48 | # --expt_name 57k_maps_r3 --module_text_spotting 49 | # --spotter_model testr --spotter_config /home/maplord/rumsey/spotter-testr/TESTR/configs/TESTR/SynthMap/SynthMap_Polygon.yaml --spotter_expt_name test 50 | # Running spotter-v2 51 | # python run.py --text_spotting_model_dir /home/maplord/rumsey/spotter-v2/PALEJUN/ 52 | # --sample_map_csv_path /home/maplord/maplist_csv/luna_omo_splits/luna_omo_metadata_56628_20220724_part1.csv 53 | # --expt_name 57k_maps_r3 --module_text_spotting 54 | # --spotter_model spotter-v2 --spotter_config /home/maplord/rumsey/spotter-v2/PALEJUN/configs/PALEJUN/SynthMap/SynthMap_Polygon.yaml --spotter_expt_name test 55 | # Running spotter-palette 56 | # python run.py --text_spotting_model_dir /home/maplord/rumsey/spotter-palette/PALETTE/ 57 | # --sample_map_csv_path /home/maplord/maplist_csv/luna_omo_splits/luna_omo_metadata_56628_20220724_part1.csv 58 | # --expt_name 57k_maps_r3 --module_text_spotting 59 | # --spotter_model palette --spotter_config /home/maplord/rumsey/spotter-palette/PALETTE/configs/PALETTE/Pretrain/SynthMap_Polygon.yaml --spotter_expt_name test 60 | 61 | parser.add_argument('--print_command', default=False, action='store_true') 62 | parser.add_argument('--gpu_id', type=int, default=0) 63 | 64 | 65 | args = parser.parse_args() 66 | print('\n') 67 | print(args) 68 | print('\n') 69 | 70 | run_pipeline(args) 71 | 72 | 73 | 74 | if __name__ == '__main__': 75 | 76 | main() 77 | 78 | 79 | -------------------------------------------------------------------------------- /run_img.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import glob 4 | import argparse 5 | import time 6 | import logging 7 | import pandas as pd 8 | import pdb 9 | import datetime 10 | from PIL import Image 11 | from utils import run_pipeline 12 | 13 | 14 | 15 | # This script handles the case where the input is a folder of images.
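# A minimal example invocation sketch (for illustration only; every path below is a placeholder
# for your own setup, not one of the authors' actual paths):
#   python run_img.py --input_dir_path /path/to/folder_of_images/ \
#       --output_folder /path/to/output --expt_name my_maps \
#       --module_cropping --module_text_spotting --module_img_geojson \
#       --spotter_model spotter-v2 --spotter_config /path/to/spotter_config.yaml --spotter_expt_name test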
16 | # tested image: /home/maplord/rumsey/mapkurator-system/data/100_maps_crop/crop_leeje_2/test_run_img/ 17 | logging.basicConfig(level=logging.INFO) 18 | Image.MAX_IMAGE_PIXELS=None # allow reading huge images 19 | 20 | 21 | 22 | def main(): 23 | parser = argparse.ArgumentParser() 24 | 25 | parser.add_argument('--map_kurator_system_dir', type=str, default='/home/maplord/rumsey/mapkurator-system/') 26 | parser.add_argument('--text_spotting_model_dir', type=str, default='/home/maplord/rumsey/TESTR/') 27 | 28 | parser.add_argument('--input_dir_path', type=str, default=None) 29 | 30 | parser.add_argument('--output_folder', type=str, default='/data2/rumsey_output') 31 | parser.add_argument('--expt_name', type=str, default='1000_maps') # output prefix 32 | 33 | parser.add_argument('--module_get_dimension', default=False, action='store_true') 34 | parser.add_argument('--module_gen_geotiff', default=False, action='store_true') 35 | parser.add_argument('--module_cropping', default=False, action='store_true') 36 | parser.add_argument('--module_text_spotting', default=False, action='store_true') 37 | parser.add_argument('--module_img_geojson', default=False, action='store_true') 38 | 39 | 40 | parser.add_argument('--spotter_model', type=str, default='spotter-v2', choices=['testr', 'spotter-v2', "palette"], 41 | help='Select text spotting model option from ["testr", "spotter-v2", "palette"]') # select text spotting model 42 | parser.add_argument('--spotter_config', type=str, default='/home/maplord/rumsey/TESTR/configs/TESTR/SynMap/SynMap_Polygon.yaml', 43 | help='Path to the config file for text spotting model') 44 | parser.add_argument('--spotter_expt_name', type=str, default='exp', 45 | help='Name of spotter experiment; if empty, the config file name is used') 46 | 47 | parser.add_argument('--print_command', default=False, action='store_true') 48 | parser.add_argument('--gpu_id', type=int, default=0) 49 | 50 | args = parser.parse_args() 51 | print('\n') 52 | print(args) 53 | print('\n') 54 | 55 | run_pipeline(args) 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 | 61 | main() 62 | 63 | 64 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pandas as pd 4 | import ast 5 | import argparse 6 | import logging 7 | import pdb 8 | from PIL import Image 9 | import datetime 10 | import subprocess 11 | import time 12 | 13 | logging.basicConfig(level=logging.INFO) 14 | Image.MAX_IMAGE_PIXELS=None 15 | 16 | 17 | def execute_command(command, if_print_command): 18 | t1 = time.time() 19 | 20 | if if_print_command: 21 | print(command) 22 | 23 | try: 24 | subprocess.run(command, shell=True, check=True, capture_output=True) # alternatively: stderr=subprocess.STDOUT 25 | t2 = time.time() 26 | time_usage = t2 - t1 27 | return {'time_usage':time_usage} 28 | except subprocess.CalledProcessError as err: 29 | error = err.stderr.decode('utf8') 30 | # format error message to one line 31 | error = error.replace('\n','\t') 32 | error = error.replace(',',';') 33 | return {'error': error} 34 | 35 | 36 | def get_img_dimension(img_path): 37 | map_img = Image.open(img_path) 38 | width, height = map_img.size 39 | 40 | return width, height 41 | 42 | 43 | def run_pipeline(args): 44 | # ------------------------- Pass arguments ----------------------------------------- 45 | map_kurator_system_dir = args.map_kurator_system_dir 46 | text_spotting_model_dir = args.text_spotting_model_dir 47 | 48 
| if hasattr(args, "sample_map_csv_path"): 49 | # run.py passes --sample_map_csv_path; run_img.py passes --input_dir_path instead 50 | sample_map_path = args.sample_map_csv_path 51 | module_geocoord_geojson = args.module_geocoord_geojson 52 | module_post_ocr_entity_linking = args.module_post_ocr_entity_linking 53 | module_post_ocr_only = args.module_post_ocr_only 54 | module_post_ocr = args.module_post_ocr 55 | 56 | elif hasattr(args, "input_dir_path"): 57 | input_dir_path = args.input_dir_path 58 | 59 | expt_name = args.expt_name 60 | output_folder = args.output_folder 61 | 62 | module_get_dimension = args.module_get_dimension 63 | module_gen_geotiff = args.module_gen_geotiff 64 | module_cropping = args.module_cropping 65 | module_text_spotting = args.module_text_spotting 66 | module_img_geojson = args.module_img_geojson 67 | 68 | spotter_model = args.spotter_model 69 | spotter_config = args.spotter_config 70 | spotter_expt_name = args.spotter_expt_name 71 | gpu_id = args.gpu_id 72 | 73 | if_print_command = args.print_command 74 | error_reason_dict = dict() 75 | 76 | if "sample_map_path" in locals(): 77 | # ------------------------- Read sample map list and prepare output dir ---------------- 78 | if sample_map_path is not None: 79 | input_csv_path = sample_map_path 80 | if input_csv_path[-4:] == '.csv': 81 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str}) 82 | elif input_csv_path[-4:] == '.tsv': 83 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str}, sep='\t') 84 | else: 85 | raise NotImplementedError 86 | 87 | external_id_to_img_path_dict, unmatched_external_id_list = get_img_path_from_external_id_and_image_no( sample_map_path = input_csv_path) 88 | 89 | # initialize error reason dict 90 | 91 | for ex_id in unmatched_external_id_list: 92 | error_reason_dict[ex_id] = {'img_path':None, 'error':'Cannot find image given external_id.'} 93 | 94 | elif "input_dir_path" in locals(): 95 | if input_dir_path is not None: 96 | input_img_path = input_dir_path 97 | sample_map_df = pd.DataFrame(columns = ["external_id"]) 98 | for images in os.listdir(input_img_path): 99 | tmp_path={"external_id": os.path.join(input_img_path,images)} 100 | sample_map_df = pd.concat([sample_map_df, pd.DataFrame([tmp_path])], ignore_index=True) # DataFrame.append was removed in pandas 2.0 101 | else: 102 | raise NotImplementedError 103 | else: 104 | raise NotImplementedError 105 | 106 | 107 | expt_out_dir = os.path.join(output_folder, expt_name) 108 | geotiff_output_dir = os.path.join(output_folder, expt_name, 'geotiff') 109 | cropping_output_dir = os.path.join(output_folder, expt_name, 'crop/') 110 | spotting_output_dir = os.path.join(output_folder, expt_name, 'spotter/' + spotter_expt_name) 111 | stitch_output_dir = os.path.join(output_folder, expt_name, 'stitch/' + spotter_expt_name) 112 | geocoord_output_dir = os.path.join(output_folder, expt_name, 'geocoord/' + spotter_expt_name) 113 | postocr_linking_output_dir = os.path.join(output_folder, expt_name, 'postocr_linking/'+ spotter_expt_name) 114 | postocr_only_output_dir = os.path.join(output_folder, expt_name, 'postocr_only/'+ spotter_expt_name) 115 | 116 | 117 | if not os.path.isdir(expt_out_dir): 118 | os.makedirs(expt_out_dir) 119 | 120 | # ------------------------ Get image dimension ------------------------------ 121 | if module_get_dimension: 122 | for index, record in sample_map_df.iterrows(): 123 | external_id = record.external_id 124 | # pdb.set_trace() 125 | if "sample_map_path" in locals(): 126 | if external_id not in external_id_to_img_path_dict: 127 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in 
external_id_to_img_path_dict'} 128 | continue 129 | 130 | img_path = external_id_to_img_path_dict[external_id] 131 | 132 | try: 133 | width, height = get_img_dimension(img_path) 134 | except Exception as e: 135 | error_reason_dict[external_id] = {'img_path':img_path, 'error': e } 136 | 137 | elif "input_dir_path" in locals(): 138 | img_path = sample_map_df['external_id'].iloc[index] 139 | width, height = get_img_dimension(img_path) 140 | 141 | map_name = os.path.basename(img_path).split('.')[0] 142 | 143 | # ------------------------- Generate geotiff ------------------------------ 144 | time_start = time.time() 145 | if module_gen_geotiff: 146 | os.chdir(os.path.join(map_kurator_system_dir ,'m1_geotiff')) 147 | 148 | if not os.path.isdir(geotiff_output_dir): 149 | os.makedirs(geotiff_output_dir) 150 | 151 | # use converted jpg folder instead of original sid folder 152 | if "sample_map_path" in locals(): 153 | merged_input_path=sample_map_path 154 | else: 155 | merged_input_path=input_dir_path 156 | 157 | run_geotiff_command = 'python convert_image_to_geotiff.py --sid_root_dir /data2/rumsey_sid_to_jpg/ --sample_map_path '+ merged_input_path +' --out_geotiff_dir '+geotiff_output_dir # can change params in argparse 158 | exe_ret = execute_command(run_geotiff_command, if_print_command) 159 | if 'error' in exe_ret: 160 | error = exe_ret['error'] 161 | 162 | 163 | 164 | # ------------------------- Image cropping ------------------------------ 165 | if module_cropping: 166 | for index, record in sample_map_df.iterrows(): 167 | external_id = record.external_id 168 | if "sample_map_path" in locals(): 169 | if external_id not in external_id_to_img_path_dict: 170 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'} 171 | continue 172 | img_path = external_id_to_img_path_dict[external_id] 173 | else: 174 | img_path = sample_map_df['external_id'].iloc[index] 175 | 176 | map_name = os.path.basename(img_path).split('.')[0] 177 | 178 | os.chdir(os.path.join(map_kurator_system_dir ,'m2_detection_recognition')) 179 | if not os.path.isdir(cropping_output_dir): 180 | os.makedirs(cropping_output_dir) 181 | 182 | run_crop_command = 'python crop_img.py --img_path '+img_path + ' --output_dir '+ cropping_output_dir 183 | 184 | exe_ret = execute_command(run_crop_command, if_print_command) 185 | 186 | if "sample_map_path" in locals(): 187 | if 'error' in exe_ret: 188 | error = exe_ret['error'] 189 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error } 190 | 191 | 192 | 193 | time_cropping = time.time() 194 | 195 | # ------------------------- Text Spotting (patch level) ------------------------------ 196 | if module_text_spotting: 197 | assert os.path.exists(spotter_config), "Config file for spotter must exist!" 
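# How this module proceeds: the spotter checkout is rebuilt in place ("python setup.py build develop",
# with build output silenced), then tools/inference.py is run once per map over its cropped patches.
# A map is skipped when its output dir already holds one JSON per cropped patch image; otherwise the
# stale output dir is removed and spotting is re-run for that map.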
198 | os.chdir(text_spotting_model_dir) 199 | os.system("python setup.py build develop 1> /dev/null") 200 | 201 | for index, record in sample_map_df.iterrows(): 202 | 203 | external_id = record.external_id 204 | if "sample_map_path" in locals(): 205 | if external_id not in external_id_to_img_path_dict: 206 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'} 207 | continue 208 | img_path = external_id_to_img_path_dict[external_id] 209 | else: 210 | img_path = sample_map_df['external_id'].iloc[index] 211 | 212 | map_name = os.path.basename(img_path).split('.')[0] 213 | # print(map_name) 214 | 215 | map_spotting_output_dir = os.path.join(spotting_output_dir, map_name) 216 | 217 | if not os.path.isdir(map_spotting_output_dir): 218 | os.makedirs(map_spotting_output_dir) 219 | else: 220 | num_existing_json = len(glob.glob(os.path.join(map_spotting_output_dir, '*.json'))) 221 | num_existing_images = len(glob.glob(os.path.join(cropping_output_dir, map_name, '*jpg'))) 222 | if num_existing_json == num_existing_images: 223 | continue 224 | else: 225 | print(f'{index}/{len(sample_map_df)}: Re-run spotting for map {map_name}') 226 | import shutil 227 | shutil.rmtree(map_spotting_output_dir) 228 | os.makedirs(map_spotting_output_dir) 229 | 230 | if spotter_model in ['testr', 'spotter-v2', 'palette']: 231 | run_spotting_command = f'CUDA_VISIBLE_DEVICES={gpu_id} python tools/inference.py --config-file {spotter_config} --output_json --input {os.path.join(cropping_output_dir,map_name)} --output {map_spotting_output_dir}' 232 | else: 233 | raise NotImplementedError 234 | 235 | # print(run_spotting_command) 236 | run_spotting_command += ' 1> /dev/null' 237 | 238 | exe_ret = execute_command(run_spotting_command, if_print_command) 239 | if "sample_map_path" in locals(): 240 | if 'error' in exe_ret: 241 | error = exe_ret['error'] 242 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error } 243 | 244 | # elif 'time_usage' in exe_ret: 245 | # time_usage = exe_ret['time_usage'] 246 | # time_usage_dict[external_id]['spotting'] = time_usage 247 | # else: 248 | # raise NotImplementedError 249 | 250 | logging.info(f'{index}/{len(sample_map_df)}: Done text spotting for {map_name}') 251 | 252 | # time_text_spotting = time.time() 253 | 254 | 255 | # ------------------------- Image coord geojson (map level) ------------------------------ 256 | if module_img_geojson: 257 | os.chdir(os.path.join(map_kurator_system_dir ,'m3_image_geojson')) 258 | 259 | if not os.path.isdir(stitch_output_dir): 260 | os.makedirs(stitch_output_dir) 261 | 262 | for index, record in sample_map_df.iterrows(): 263 | external_id = record.external_id 264 | if "sample_map_path" in locals(): 265 | if external_id not in external_id_to_img_path_dict: 266 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'} 267 | continue 268 | img_path = external_id_to_img_path_dict[external_id] 269 | else: 270 | img_path = sample_map_df['external_id'].iloc[index] 271 | map_name = os.path.basename(img_path).split('.')[0] 272 | 273 | stitch_input_dir = os.path.join(spotting_output_dir, map_name) 274 | output_geojson = os.path.join(stitch_output_dir, map_name + '.geojson') 275 | 276 | run_stitch_command = 'python stitch_output.py --input_dir '+stitch_input_dir + ' --output_geojson ' + output_geojson 277 | 278 | exe_ret = execute_command(run_stitch_command, if_print_command) 279 | 280 | if "sample_map_path" in locals(): 281 | if 'error' in exe_ret: 282 | 
error = exe_ret['error'] 283 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error } 284 | 285 | # elif 'time_usage' in exe_ret: 286 | # time_usage = exe_ret['time_usage'] 287 | # time_usage_dict[external_id]['stitch'] = time_usage 288 | # else: 289 | # raise NotImplementedError 290 | 291 | # time_img_geojson = time.time() 292 | 293 | 294 | # ------------------------- post-OCR ------------------------------ 295 | if "sample_map_path" in locals(): 296 | if module_post_ocr: 297 | os.chdir(os.path.join(map_kurator_system_dir, 'm4_post_ocr')) 298 | 299 | if not os.path.isdir(postocr_only_output_dir): 300 | os.makedirs(postocr_only_output_dir) 301 | 302 | for index, record in sample_map_df.iterrows(): 303 | 304 | external_id = record.external_id 305 | if external_id not in external_id_to_img_path_dict: 306 | error_reason_dict[external_id] = {'img_path': None, 'error': 'key not in external_id_to_img_path_dict'} 307 | continue 308 | 309 | img_path = external_id_to_img_path_dict[external_id] 310 | map_name = os.path.basename(img_path).split('.')[0] 311 | 312 | input_geojson_file = os.path.join(geocoord_output_dir, map_name + '.geojson') 313 | 314 | run_postocr_command = 'python post_ocr_main.py --in_geojson_file '+ input_geojson_file + ' --out_geojson_dir ' + os.path.join(map_kurator_system_dir, postocr_only_output_dir) 315 | 316 | exe_ret = execute_command(run_postocr_command, if_print_command) 317 | 318 | if 'error' in exe_ret: 319 | error = exe_ret['error'] 320 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error } 321 | 322 | # elif 'time_usage' in exe_ret: 323 | # time_usage = exe_ret['time_usage'] 324 | # time_usage_dict[external_id]['geocoord_geojson'] = time_usage 325 | # else: 326 | # raise NotImplementedError 327 | 328 | # time_geocoord_geojson = time.time() 329 | 330 | # ------------------------- Convert image coordinates to geocoordinates ------------------------------ 331 | if "sample_map_path" in locals(): 332 | if module_geocoord_geojson: 333 | os.chdir(os.path.join(map_kurator_system_dir, 'm5_geocoordinate_converter')) 334 | 335 | if not os.path.isdir(geocoord_output_dir): 336 | os.makedirs(geocoord_output_dir) 337 | 338 | for index, record in sample_map_df.iterrows(): 339 | external_id = record.external_id 340 | if external_id not in external_id_to_img_path_dict: 341 | error_reason_dict[external_id] = {'img_path': None, 342 | 'error': 'key not in external_id_to_img_path_dict'} 343 | continue 344 | 345 | img_path = external_id_to_img_path_dict[external_id] 346 | map_name = os.path.basename(img_path).split('.')[0] 347 | 348 | # current_files_list = glob.glob(os.path.join(map_kurator_system_dir, geocoord_output_dir, "*.geojson")) 349 | 350 | # saved_map_list = [] 351 | # for mapname in current_files_list: 352 | # only_map = mapname.split("/")[-1]#.strip().replace(".geojson", "") 353 | # saved_map_list.append(only_map) 354 | 355 | in_geojson = os.path.join(stitch_output_dir, map_name + '.geojson') 356 | 357 | # current_map = in_geojson.split("/")[-1] 358 | 359 | # if current_map not in saved_map_list: 360 | # print("running missing file",current_map) 361 | 362 | run_converter_command = 'python convert_geojson_to_geocoord.py --sample_map_path ' + os.path.join(map_kurator_system_dir, input_csv_path) + ' --in_geojson_file ' + in_geojson + ' --out_geojson_dir ' + os.path.join(map_kurator_system_dir, geocoord_output_dir) 363 | 364 | exe_ret = execute_command(run_converter_command, if_print_command) 365 | 366 | if 'error' in exe_ret: 367 | error = 
exe_ret['error'] 368 | error_reason_dict[external_id] = {'img_path': img_path, 'error': error} 369 | 370 | # elif 'time_usage' in exe_ret: 371 | # time_usage = exe_ret['time_usage'] 372 | # time_usage_dict[external_id]['geocoord_geojson'] = time_usage 373 | # else: 374 | # raise NotImplementedError 375 | 376 | # time_geocoord_geojson = time.time() 377 | 378 | 379 | # --------------------- Error logging -------------------------- 380 | print('\n') 381 | current_time = datetime.datetime.now().strftime("%Y_%m_%d-%I:%M:%S_%p") 382 | error_reason_df = pd.DataFrame.from_dict(error_reason_dict, orient='index') 383 | error_reason_log_path = os.path.join(output_folder, expt_name, 'error_reason_' + current_time +'.csv') 384 | error_reason_df.to_csv(error_reason_log_path, index_label='external_id') 385 | 386 | 387 | def func_file_to_fullpath_dict(file_path_list): 388 | 389 | file_fullpath_dict = dict() 390 | for file_path in file_path_list: 391 | file_fullpath_dict[os.path.basename(file_path).split('.')[0]] = file_path 392 | 393 | return file_fullpath_dict 394 | 395 | def get_img_path_from_external_id(jp2_root_dir = '/data/rumsey-jp2/', sid_root_dir = '/data2/rumsey_sid_to_jpg/', additional_root_dir='/data2/rumsey-luna-img/', sample_map_path = None,external_id_key = 'external_id') : 396 | # returns (1) a dict with external-id as key, full image path as value (2) list of external-id that can not find image path 397 | 398 | jp2_file_path_list = glob.glob(os.path.join(jp2_root_dir, '*/*.jp2')) 399 | sid_file_path_list = glob.glob(os.path.join(sid_root_dir, '*.jpg')) 400 | add_file_path_list = glob.glob(os.path.join(additional_root_dir, '*')) 401 | 402 | jp2_file_fullpath_dict = func_file_to_fullpath_dict(jp2_file_path_list) 403 | sid_file_fullpath_dict = func_file_to_fullpath_dict(sid_file_path_list) 404 | add_file_fullpath_dict = func_file_to_fullpath_dict(add_file_path_list) 405 | 406 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str}) 407 | 408 | external_id_to_img_path_dict = {} 409 | 410 | unmatched_external_id_list = [] 411 | 412 | for index, record in sample_map_df.iterrows(): 413 | external_id = record.external_id 414 | filename_without_extension = external_id.strip("'").replace('.','') 415 | 416 | full_path = '' 417 | if filename_without_extension in jp2_file_fullpath_dict: 418 | full_path = jp2_file_fullpath_dict[filename_without_extension] 419 | elif filename_without_extension in sid_file_fullpath_dict: 420 | full_path = sid_file_fullpath_dict[filename_without_extension] 421 | elif filename_without_extension in add_file_fullpath_dict: 422 | full_path = add_file_fullpath_dict[filename_without_extension] 423 | else: 424 | # print('image with external_id not found in image_dir:', external_id) 425 | unmatched_external_id_list.append(external_id) 426 | continue 427 | assert (len(full_path)!=0) 428 | 429 | external_id_to_img_path_dict[external_id] = full_path 430 | 431 | return external_id_to_img_path_dict, unmatched_external_id_list 432 | 433 | def get_img_path_from_external_id_and_image_no(jp2_root_dir = '/data/rumsey-jp2/', sid_root_dir = '/data2/rumsey_sid_to_jpg/', additional_root_dir='/data2/rumsey-luna-img/', sample_map_path = None,external_id_key = 'external_id') : 434 | # returns (1) a dict with external-id as key, full image path as value (2) list of external-id that can not find image path 435 | 436 | jp2_file_path_list = glob.glob(os.path.join(jp2_root_dir, '*/*.jp2')) 437 | sid_file_path_list = glob.glob(os.path.join(sid_root_dir, '*.jpg')) # use converted jpg 
directly 438 | add_file_path_list = glob.glob(os.path.join(additional_root_dir, '*')) 439 | 440 | jp2_file_fullpath_dict = func_file_to_fullpath_dict(jp2_file_path_list) 441 | sid_file_fullpath_dict = func_file_to_fullpath_dict(sid_file_path_list) 442 | add_file_fullpath_dict = func_file_to_fullpath_dict(add_file_path_list) 443 | 444 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str}) 445 | 446 | external_id_to_img_path_dict = {} 447 | 448 | unmatched_external_id_list = [] 449 | for index, record in sample_map_df.iterrows(): 450 | external_id = record.external_id 451 | image_no = record.image_no 452 | # filename_without_extension = external_id.strip("'").replace('.','') 453 | filename_without_extension = image_no.strip("'").split('.')[0] 454 | 455 | full_path = '' 456 | if filename_without_extension in jp2_file_fullpath_dict: 457 | full_path = jp2_file_fullpath_dict[filename_without_extension] 458 | elif filename_without_extension in sid_file_fullpath_dict: 459 | full_path = sid_file_fullpath_dict[filename_without_extension] 460 | elif filename_without_extension in add_file_fullpath_dict: 461 | full_path = add_file_fullpath_dict[filename_without_extension] 462 | else: 463 | print('image with external_id not found in image_dir:', external_id) 464 | unmatched_external_id_list.append(external_id) 465 | continue 466 | assert (len(full_path)!=0) 467 | 468 | external_id_to_img_path_dict[external_id] = full_path 469 | 470 | return external_id_to_img_path_dict, unmatched_external_id_list 471 | 472 | 473 | if __name__ == '__main__': 474 | 475 | parser = argparse.ArgumentParser() 476 | parser.add_argument('--jp2_root_dir', type=str, default='/data/rumsey-jp2/', 477 | help='image dir of jp2 files.') 478 | parser.add_argument('--sid_root_dir', type=str, default='/data2/rumsey_sid_to_jpg/', 479 | help='image dir of sid files.') 480 | parser.add_argument('--additional_root_dir', type=str, default='/data2/rumsey-luna-img/', 481 | help='image dir of additional luna files.') 482 | parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv', 483 | help='path to sample map csv, which contains gcps info') 484 | parser.add_argument('--external_id_key', type=str, default='external_id', 485 | help='key string for external id, could be external_id or ListNo') 486 | 487 | args = parser.parse_args() 488 | print(args) 489 | 490 | # get_img_path_from_external_id(jp2_root_dir = args.jp2_root_dir, sid_root_dir = args.sid_root_dir, additional_root_dir = args.additional_root_dir, 491 | # sample_map_path = args.sample_map_path,external_id_key = args.external_id_key) 492 | 493 | get_img_path_from_external_id_and_image_no(jp2_root_dir = args.jp2_root_dir, sid_root_dir = args.sid_root_dir, additional_root_dir = args.additional_root_dir, 494 | sample_map_path = args.sample_map_path,external_id_key = args.external_id_key) 495 | --------------------------------------------------------------------------------