├── .gitignore
├── README.md
├── external_id_search
│   └── script.py
├── m0_preprocessing
│   └── convert_sid_to_jpg.py
├── m1_geotiff
│   └── convert_image_to_geotiff.py
├── m2_detection_recognition
│   └── crop_img.py
├── m3_image_geojson
│   ├── run.sh
│   └── stitch_output.py
├── m4_post_ocr
│   ├── logstash_postocr.conf
│   ├── post_ocr.py
│   ├── post_ocr_main.py
│   └── preprocess.py
├── m5_geocoordinate_converter
│   └── convert_geojson_to_geocoord.py
├── m6_entity_linker
│   ├── create_elasticsearch_index.py
│   ├── create_spatial_index_postgres.py
│   ├── entity_linking.py
│   ├── logstash_osm_linker.conf
│   ├── logstash_postgres_world.conf
│   ├── postgres_logger.py
│   ├── upload_osm_to_postgres_all_continents.py
│   └── upload_osm_to_postgres_ogr2ogr.py
├── m_sanborn
│   ├── s1_geocoding.py
│   ├── s2_clustering.py
│   └── s3_gen_geojson.py
├── requirements.txt
├── run.py
├── run_img.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
data/
data0/
data1/
rumsey_output/
.idea/
.env
MrSID*
__pycache__
debug/
.ipynb_checkpoints/
run_linker.py
osm_linker.csv
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

## About mapKurator System

**[New]**: Our documentation website for the mapKurator system is up! [https://knowledge-computing.github.io/mapkurator-doc/](https://knowledge-computing.github.io/mapkurator-doc/#/)

[mapKurator](https://dl.acm.org/doi/abs/10.1145/3589132.3625579) is a fully automatic pipeline developed by the [**Knowledge Computing Lab**](https://knowledge-computing.github.io/) at the **University of Minnesota** to process large numbers of scanned historical map images. Outputs include the recognized text labels, label bounding polygons, labels after post-OCR correction, and a geo-entity identifier from OpenStreetMap.

### mapKurator textspotter repository
Please refer to the following links for the spotter models used in mapKurator: [Spotter-v2](https://github.com/knowledge-computing/mapkurator-spotter), [PALETTE](https://github.com/knowledge-computing/mapkurator-palette)

---------

## Data Card - Derived Dataset Processed by mapKurator System

Map text recognized from the [Rumsey historical map collection](https://www.davidrumsey.com/), which contains 57K georeferenced maps.

### Dataset Download Link

Text extraction and recognition results: [https://s3.msi.umn.edu/rumsey_output/Round3/english.zip](https://s3.msi.umn.edu/rumsey_output/Round3/english.zip) (~50GB)

### Dataset Languages

English

### Language Creators:

Machine-generated

## Dataset Structure

### Data Fields



### Output File Name

Each output GeoJSON file is named after the external ID of the original map image.
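For reference, a feature produced by the full pipeline looks roughly like the following (values are illustrative; `text` and `score` come from the text spotter, `postocr_label` from the post-OCR module, and `osm_id` from the entity linker; the geocoordinate converter additionally preserves the pixel-space polygon under an `img_coordinates` property):

```json
{
  "type": "Feature",
  "geometry": {
    "type": "Polygon",
    "coordinates": [[[-13161875.8, 4035394.2], [-13161760.3, 4035394.2],
                     [-13161760.3, 4035440.1], [-13161875.8, 4035440.1],
                     [-13161875.8, 4035394.2]]]
  },
  "properties": {
    "text": "Sacramenta",
    "score": 0.92,
    "postocr_label": "Sacramento",
    "osm_id": ["6232940"]
  }
}
```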
### Citation
```
@inproceedings{kim2023mapkurator,
  title={The mapKurator System: A Complete Pipeline for Extracting and Linking Text from Historical Maps},
  author={Kim, Jina and Li, Zekun and Lin, Yijun and Namgung, Min and Jang, Leeje and Chiang, Yao-Yi},
  booktitle={Proceedings of the 31st ACM International Conference on Advances in Geographic Information Systems},
  pages={1--4},
  year={2023}
}
```

### Licensing Information

CC BY-NC 2.0

--------------------------------------------------------------------------------
/external_id_search/script.py:
--------------------------------------------------------------------------------
from elasticsearch_dsl import Search, Q
from elasticsearch import Elasticsearch, helpers
from elasticsearch import RequestsHttpConnection
import argparse
import os
import glob
import json
import nltk
import logging
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")



def db_connect():
    """Elasticsearch connection on Sansa"""
    load_dotenv()

    DB_HOST = os.getenv("DB_HOST")
    USER_NAME = os.getenv("DB_USERNAME")
    PASSWORD = os.getenv("DB_PASSWORD")

    es = Elasticsearch([DB_HOST], connection_class=RequestsHttpConnection, http_auth=(USER_NAME, PASSWORD), verify_certs=False)
    return es


def query(target):
    es = db_connect()
    inputs = target.upper()
    query = {"query": {"match": {"text": f"{inputs}"}}}
    hits = es.search(index="meta", body=query, size=10000)["hits"]["hits"]

    id_list = []
    if len(hits) != 0:
        for i in range(len(hits)):
            map_id = hits[i]['_source']['external_id']
            id_list.append(map_id)

    result = sorted(list(set(id_list)))
    return result


def main(args):
    keyword = args.target
    metadata_path = args.metadata
    meta_df = pd.read_csv(metadata_path)
    meta_df['tmp'] = meta_df['image_no'].str.split(".").str[0]

    results = query(keyword)
    # print(f'"{keyword}" appears in: {results}')

    tmp_df = meta_df[meta_df.tmp.isin(results)]

    print(f'"{keyword}" appears in:')
    for index, row in tmp_df.iterrows():
        print(f'{row.tmp} \t {row.title}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--target', type=str, default='east', help='keyword to search for in the map metadata')
    parser.add_argument('--metadata', type=str, default='/home/maplord/maplist_csv/luna_omo_metadata_56628_20220724.csv', help='path to the metadata csv')

    args = parser.parse_args()
    print(args)

    main(args)
--------------------------------------------------------------------------------
/m0_preprocessing/convert_sid_to_jpg.py:
--------------------------------------------------------------------------------
import os
import glob
import time
import multiprocessing

sid_dir = '/data/rumsey-sid'
sid_to_jpg_dir = '/data2/rumsey_sid_to_jpg/'
num_process = 20
if_print_command = True

sid_list = glob.glob(os.path.join(sid_dir, '*/*.sid'))

def execute_command(command, if_print_command):
    t1 = time.time()

    if if_print_command:
        print(command)
    os.system(command)

    t2 = time.time()
    time_usage = t2 - t1
    return time_usage

def conversion(img_path):
    mrsiddecode_executable = "/home/zekun/dr_maps/mapkurator-system/m1_geotiff/MrSID_DSDK-9.5.4.4709-rhel6.x86-64.gcc531/Raster_DSDK/bin/mrsiddecode"
    map_name = os.path.basename(img_path)[:-4]

    redirected_path = os.path.join(sid_to_jpg_dir, map_name + '.jpg')

    run_sid_to_jpg_command = mrsiddecode_executable + ' -quiet -i ' + img_path + ' -o ' + redirected_path
    time_usage = execute_command(run_sid_to_jpg_command, if_print_command)



if __name__ == "__main__":
    pool = multiprocessing.Pool(num_process)
    start_time = time.perf_counter()
    processes = [pool.apply_async(conversion, args=(sid_path,)) for sid_path in sid_list]
    result = [p.get() for p in processes]
    finish_time = time.perf_counter()
    print(f"Program finished in {finish_time-start_time} seconds")
--------------------------------------------------------------------------------
/m1_geotiff/convert_image_to_geotiff.py:
--------------------------------------------------------------------------------
import os
import glob
import pandas as pd
import ast
import argparse
import logging
import pdb

logging.basicConfig(level=logging.INFO)

def func_file_to_fullpath_dict(file_path_list):

    file_fullpath_dict = dict()
    for file_path in file_path_list:
        file_fullpath_dict[os.path.basename(file_path).split('.')[0]] = file_path

    return file_fullpath_dict

def main(args):

    jp2_root_dir = args.jp2_root_dir
    sid_root_dir = args.sid_root_dir
    additional_root_dir = args.additional_root_dir
    out_geotiff_dir = args.out_geotiff_dir

    sample_map_path = args.sample_map_path
    external_id_key = args.external_id_key

    jp2_file_path_list = glob.glob(os.path.join(jp2_root_dir, '*/*.jp2'))
    sid_file_path_list = glob.glob(os.path.join(sid_root_dir, '*.jpg'))  # use converted jpg directly
    add_file_path_list = glob.glob(os.path.join(additional_root_dir, '*'))

    jp2_file_fullpath_dict = func_file_to_fullpath_dict(jp2_file_path_list)
    sid_file_fullpath_dict = func_file_to_fullpath_dict(sid_file_path_list)
    add_file_fullpath_dict = func_file_to_fullpath_dict(add_file_path_list)

    sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id': str})


    for index, record in sample_map_df.iterrows():
        external_id = record.external_id
        transform_method = record.transformation_method
        gcps = record.gcps
        filename_without_extension = external_id.strip("'").replace('.', '')

        full_path = ''
        if filename_without_extension in jp2_file_fullpath_dict:
            full_path = jp2_file_fullpath_dict[filename_without_extension]
        elif filename_without_extension in sid_file_fullpath_dict:
            full_path = sid_file_fullpath_dict[filename_without_extension]
        elif filename_without_extension in add_file_fullpath_dict:
            full_path = add_file_fullpath_dict[filename_without_extension]
        else:
            print('image with external_id not found in image_dir:', external_id)
            continue
        assert (len(full_path) != 0)

        gcps = ast.literal_eval(gcps)

        gcp_str = ''
        for gcp in gcps:
            lng, lat = gcp['location']
            x, y = gcp['pixel']
            gcp_str += '-gcp ' + str(x) + ' ' + str(y) + ' ' + str(lng) + ' ' + str(lat) + ' '
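        # Example of one row's `gcps` cell (a stringified list of dicts) and the
        # resulting flag string; the values here are illustrative:
        #   [{'location': [-118.24, 34.05], 'pixel': [100, 200]}, ...]
        #   -> '-gcp 100 200 -118.24 34.05 ... '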
        # gdal_translate to attach the GCPs to the raw image
        gdal_command = 'gdal_translate -of Gtiff ' + gcp_str + full_path + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff'
        print(gdal_command)
        os.system(gdal_command)


        assert transform_method in ['affine', 'polynomial', 'tps']

        # reprojection with gdalwarp
        if transform_method == 'affine':
            # first order
            warp_command = 'gdalwarp -s_srs EPSG:4326 -t_srs EPSG:3857 -r near -order 1 -of GTiff ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff' + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '.geotiff'

        elif transform_method == 'polynomial':
            # second order
            warp_command = 'gdalwarp -s_srs EPSG:4326 -t_srs EPSG:3857 -r near -order 2 -of GTiff ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff' + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '.geotiff'

        elif transform_method == 'tps':
            # thin plate spline  #debug/11558008.geotiff #10057000.geotiff
            warp_command = 'gdalwarp -s_srs EPSG:4326 -t_srs EPSG:3857 -r near -tps -of GTiff ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff' + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '.geotiff'

        else:
            raise NotImplementedError
        print(warp_command)
        os.system(warp_command)
        # remove temporary tiff file
        # os.system('rm ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff')


        logging.info('Done generating geotiff for %s', external_id)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--jp2_root_dir', type=str, default='/data/rumsey-jp2/',
                        help='image dir of jp2 files.')
    parser.add_argument('--sid_root_dir', type=str, default='/data2/rumsey_sid_to_jpg/',
                        help='image dir of sid files (converted to jpg).')
    parser.add_argument('--additional_root_dir', type=str, default='/data2/rumsey-luna-img/',
                        help='image dir of additional luna files.')
    parser.add_argument('--out_geotiff_dir', type=str, default='data/geotiff/',
                        help='output dir for geotiff')
    parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
                        help='path to sample map csv, which contains gcps info')
    parser.add_argument('--external_id_key', type=str, default='external_id',
                        help='key string for external id, could be external_id or ListNo')

    args = parser.parse_args()
    print(args)


    main(args)
--------------------------------------------------------------------------------
/m2_detection_recognition/crop_img.py:
--------------------------------------------------------------------------------
import sys
import os
from PIL import Image, ImageFile
import numpy as np
import argparse
import logging

logging.basicConfig(level=logging.INFO)
Image.MAX_IMAGE_PIXELS = None  # allow reading huge images

# allow loading truncated image files (requires the ImageFile import above)
ImageFile.LOAD_TRUNCATED_IMAGES = True

def main(args):

    img_path = args.img_path
    output_dir = args.output_dir

    map_name = os.path.basename(img_path).split('.')[0]  # map name without extension
    output_dir = os.path.join(output_dir, map_name)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    map_img = Image.open(img_path)
    width, height = map_img.size

    # print(width, height)

    shift_size = 1000

    # pad the image to a size divisible by shift_size
    num_tiles_w = int(np.ceil(1. * width / shift_size))
    num_tiles_h = int(np.ceil(1. * height / shift_size))
    enlarged_width = int(shift_size * num_tiles_w)
    enlarged_height = int(shift_size * num_tiles_h)

    enlarged_map = Image.new(mode="RGB", size=(enlarged_width, enlarged_height))
    # paste map_img onto the enlarged canvas
    enlarged_map.paste(map_img)

    for idx in range(0, num_tiles_h):
        for jdx in range(0, num_tiles_w):
            img_clip = enlarged_map.crop((jdx * shift_size, idx * shift_size, (jdx + 1) * shift_size, (idx + 1) * shift_size))

            # tiles are named h{row}_w{col}.jpg, e.g. h2_w3.jpg covers pixels x in [3000, 4000) and y in [2000, 3000)
            out_path = os.path.join(output_dir, 'h' + str(idx) + '_w' + str(jdx) + '.jpg')
            img_clip.save(out_path)

    logging.info('Done cropping %s' % img_path)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--img_path', type=str, default='../data/100_maps/8628000.jp2',
                        help='path to image file.')
    parser.add_argument('--output_dir', type=str, default='../data/100_maps_crop/',
                        help='path to output dir')

    args = parser.parse_args()
    print(args)


    # if not os.path.isdir(args.output_dir):
    #     os.makedirs(args.output_dir)
    #     print('created dir', args.output_dir)

    main(args)
--------------------------------------------------------------------------------
/m3_image_geojson/run.sh:
--------------------------------------------------------------------------------
# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/13415000' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/13415000.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/0845008' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/0845008.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/8407000' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/8407000.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/13272452' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/13272452.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/6855023' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/6855023.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/10198088' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/10198088.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/2119002' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/2119002.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/5850099' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/5850099.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/0352067' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/0352067.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/8496000' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/8496000.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/10285112' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/10285112.geojson'
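# Batch variant (a sketch, not part of the original runs): stitch every map
# under one spotter output dir, following the path layout used in this script.
#
# for d in /data2/rumsey_output/57k_maps_r3/spotter/arabic/*/ ; do
#     map_id=$(basename "$d")
#     python3 stitch_output.py --input_dir="${d%/}" --output_geojson="/data2/rumsey_output/57k_maps_r3/stitch/arabic/${map_id}.geojson"
# done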
# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/11201250' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/11201250.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/7924008' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/7924008.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/8859002' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/8859002.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/2239006' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/2239006.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/6954000' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/6954000.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/9085004' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/9085004.geojson'

# python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/6353076' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/6353076.geojson'


python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/english/0019149' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/english/0019149.geojson'
python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/russian/0019149' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/russian/0019149.geojson'
python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/0019149' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/arabic/0019149.geojson'
python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/chinese/0019149' --output_geojson='/data2/rumsey_output/57k_maps_r3/stitch/chinese/0019149.geojson'
--------------------------------------------------------------------------------
/m3_image_geojson/stitch_output.py:
--------------------------------------------------------------------------------
import os
import glob
import pandas as pd
import numpy as np
import argparse
from geojson import Polygon, Feature, FeatureCollection, dump
import logging
import pdb

# logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.ERROR)
pd.options.mode.chained_assignment = None

def concatenate_and_convert_to_geojson(args):
    map_subdir = args.input_dir
    output_geojson = args.output_geojson
    shift_size = args.shift_size
    eval_bool = args.eval_only

    file_list = glob.glob(map_subdir + '/*.json')
    file_list = sorted(file_list)
    if len(file_list) == 0:
        logging.warning('No files found for %s' % map_subdir)

    map_data = []
    for file_path in file_list:
        patch_index_h, patch_index_w = os.path.basename(file_path).split('.')[0].split('_')
        patch_index_h = int(patch_index_h[1:])
        patch_index_w = int(patch_index_w[1:])

        try:
            # force the text column to dtype 'object' to avoid type coercion (e.g. '6' -> 6.0, 'NAn' -> nan)
            df = pd.read_json(file_path, dtype={'text': object})
        except pd.errors.EmptyDataError:
            logging.warning('%s is empty. Skipping.' % file_path)
            continue
        except KeyError as ke:
            logging.warning('%s has no detected labels. Skipping.' % file_path)
            continue
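        # Patch offsets: tile h{i}_w{j}.json holds detections in tile-local
        # pixel coordinates, so each polygon is shifted by j*shift_size in x
        # and i*shift_size in y. E.g. with shift_size=1000, a point (120, 40)
        # in h2_w3.json maps to (3120, 2040) on the full map.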
        for index, line_data in df.iterrows():
            df['polygon_x'][index] = np.array(df['polygon_x'][index]).astype(float) + shift_size * patch_index_w
            df['polygon_y'][index] = np.array(df['polygon_y'][index]).astype(float) + shift_size * patch_index_h
        map_data.append(df)

    if len(map_data) == 0:
        with open(output_geojson, 'w', encoding='utf8') as f:
            pass
        print('created empty geojson for', output_geojson)
        return 0

    map_df = pd.concat(map_data)


    features = []
    for index, line_data in map_df.iterrows():
        polygon_x, polygon_y = list(line_data['polygon_x']), list(line_data['polygon_y'])

        if eval_bool == False:
            # y stays positive in the input but must be negated for QGIS visualization,
            # hence [x, -y]; appending [polygon_x[0], -polygon_y[0]] closes the polygon
            # loop, otherwise QGIS cannot display it
            polygon = Polygon([[[x, -y] for x, y in zip(polygon_x, polygon_y)] + [[polygon_x[0], -polygon_y[0]]]])
        else:
            polygon = Polygon([[[x, y] for x, y in zip(polygon_x, polygon_y)] + [[polygon_x[0], polygon_y[0]]]])

        text = line_data['text']
        score = line_data['score']
        features.append(Feature(geometry=polygon, properties={"text": text, "score": score}))

    feature_collection = FeatureCollection(features)
    # with open(os.path.join(output_dir, map_subdir + '.geojson'), 'w') as f:
    #     dump(feature_collection, f)
    with open(output_geojson, 'w', encoding='utf8') as f:
        dump(feature_collection, f, ensure_ascii=False)

    logging.info('Done generating geojson (img coord) for %s', map_subdir)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_dir', type=str, default='data/100_maps_crop_abc/0063014',
                        help='path to input json dir.')

    parser.add_argument('--output_geojson', type=str, default='data/100_maps_geojson_abc/0063014.geojson',
                        help='path to output geojson file')

    parser.add_argument('--shift_size', type=int, default=1000,
                        help='image patch size and shift size.')

    # This flag cannot be a string-typed argument; any non-empty string would be interpreted as True.
    parser.add_argument('--eval_only', default=False, action='store_true',
                        help='keep coordinates positive')

    args = parser.parse_args()
    print(args)

    concatenate_and_convert_to_geojson(args)
--------------------------------------------------------------------------------
/m4_post_ocr/logstash_postocr.conf:
--------------------------------------------------------------------------------
input {
  file {
    # NOTE: logstash's file input expects an absolute path; point this at the full path of total.csv
    path => "total.csv"
    start_position => beginning
    sincedb_path => "/dev/null"
  }
}
filter {
  csv {
    separator => ","
    columns => [
      "name",
      "count"
    ]
  }
}
output {
  elasticsearch {
    hosts => "localhost:9200"
    index => "osm-voca"
    doc_as_upsert => true
  }
}
--------------------------------------------------------------------------------
/m4_post_ocr/post_ocr.py:
--------------------------------------------------------------------------------
import logging
import requests
import json
import http.client as http_client
import nltk
import re

import elasticsearch
import elasticsearch.helpers


def lexical_search_query(target_text, es):
    """Query candidates and return the best one to be saved as 'postocr_label'"""

    clean_txt = []
    if type(target_text) == str:
        if any(char.isdigit() for char in target_text) == False:
            for t in range(len(target_text)):
                txt = target_text[t]
                if txt.isalpha():
                    clean_txt.append(txt)

            temp_label = ''.join([str(item) for item in clean_txt])
            if len(temp_label) != 0:
                target_text = temp_label

            process = re.findall('[A-Z][^A-Z]*', target_text)
            if all(c.isupper() for c in process) or len(process) == 1:

                if type(target_text) == str and any(c.isalpha() for c in target_text):
                    # edit distance 0
                    fuzziness = 0
                    inputs = target_text.lower()
                    q1 = {'query': {'fuzzy': {'name': {'value': inputs, 'fuzziness': fuzziness}}}}
                    try:
                        es_results = elasticsearch.helpers.scan(es, index="osm-voca", preserve_order=True, query=q1)
                    except elasticsearch.ElasticsearchException as es_error:
                        print(es_error)
                        return str(target_text)

                    test = [item['_source'] for item in es_results if item["_source"]['name'] is not None]


                    edist = []
                    edist_update = []

                    edd_min_find = 0
                    min_candidates = False

                    if test:
                        for tt in range(len(test)):
                            if 'name' in test[tt]:
                                candidate = test[tt]['name']
                                edist.append(candidate)

                        for e in range(len(edist)):
                            edd = nltk.edit_distance(inputs.upper(), edist[e].upper())

                            if edd == 0:
                                edist_update.append(edist[e])
                                min_candidates = edist[e]
                                edd_min_find = 1
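                    # Matching cascade: try exact vocabulary hits first (edit
                    # distance 0); if none are found, widen to fuzziness 1 and
                    # then 2, each time keeping the candidate with the highest
                    # OSM frequency count. E.g. 'sacramenta' has no exact hit,
                    # but at edit distance 1 'sacramento' is typically the most
                    # frequent candidate, so it becomes the postocr_label.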
                    # edit distance 1
                    if edd_min_find != 1:
                        fuzziness = 1

                        q2 = {'query': {'fuzzy': {'name': {'value': inputs, 'fuzziness': fuzziness}}}}
                        try:
                            es_results = elasticsearch.helpers.scan(es, index="osm-voca", preserve_order=True, query=q2)
                        except elasticsearch.ElasticsearchException as es_error:
                            print(es_error)
                            return str(target_text)

                        test = [item['_source'] for item in es_results if item["_source"]['name'] is not None]


                        edist = []
                        edist_count = []
                        edist_update = []
                        edist_count_update = []

                        if test:
                            for tt in range(len(test)):
                                if 'name' in test[tt]:
                                    # the raw logstash 'message' field holds the csv line 'name,count'
                                    candidate = test[tt]['message']
                                    cand = candidate.split(',')[0]
                                    count = candidate.split(',')[1]
                                    edist.append(cand)
                                    edist_count.append(count)

                            for e in range(len(edist)):
                                edd = nltk.edit_distance(inputs.upper(), edist[e].upper())

                                if edd == 1:
                                    edist_update.append(edist[e])
                                    edist_count_update.append(edist_count[e])

                            if len(edist_update) != 0:
                                # counts are strings; compare numerically, not lexicographically
                                index = edist_count_update.index(max(edist_count_update, key=lambda c: int(c) if str(c).strip().isdigit() else -1))
                                min_candidates = edist_update[index]
                                edd_min_find = 1

                    # edit distance 2
                    if edd_min_find != 1:
                        fuzziness = 2
                        q3 = {'query': {'fuzzy': {'name': {'value': inputs, 'fuzziness': fuzziness}}}}
                        try:
                            es_results = elasticsearch.helpers.scan(es, index="osm-voca", preserve_order=True, query=q3)
                        except elasticsearch.ElasticsearchException as es_error:
                            print(es_error)
                            return str(target_text)

                        test = [item['_source'] for item in es_results if item["_source"]['name'] is not None]

                        edist = []
                        edist_count = []
                        edist_update = []
                        edist_count_update = []

                        if test:
                            for tt in range(len(test)):
                                if 'name' in test[tt]:
                                    candidate = test[tt]['message']
                                    cand = candidate.split(',')[0]
                                    count = candidate.split(',')[1]
                                    edist.append(cand)
                                    edist_count.append(count)

                            for e in range(len(edist)):
                                edd = nltk.edit_distance(inputs.upper(), edist[e].upper())

                                if edd == 2:
                                    edist_update.append(edist[e])
                                    edist_count_update.append(edist_count[e])

                            if len(edist_update) != 0:
                                index = edist_count_update.index(max(edist_count_update, key=lambda c: int(c) if str(c).strip().isdigit() else -1))
                                min_candidates = edist_update[index]
                                edd_min_find = 1

                        if edd_min_find != 1:
                            min_candidates = False


                    if min_candidates != False:
                        return str(min_candidates)
                    else:
                        return str(target_text)

                else:  # added
                    return str(target_text)

            else:
                # multi-word / mixed-case label; keep as is
                return str(target_text)
        else:
            # text contains digits (e.g. '140D' stays '140D')
            return str(target_text)
    else:
        return str(target_text)
--------------------------------------------------------------------------------
/m4_post_ocr/post_ocr_main.py:
--------------------------------------------------------------------------------
import os
import argparse
import ast
import re
import pandas as pd
import numpy as np
import geojson
import json
from dotenv import load_dotenv
from shapely.geometry import Polygon
import psycopg2
import reverse_geocoder as rg
import pycountry_convert as pc
from pyproj import Transformer, transform, Proj
import sys
import elasticsearch
import elasticsearch.helpers
from post_ocr import lexical_search_query
import logging
import time

logging.basicConfig(level=logging.INFO)



def save_postocr_results(in_geojson_data, unique_map_text_li, es_conn, output_dir, in_geojson_filename):
    result_dict_postocr = dict()
    for map_text in set(unique_map_text_li):
        map_text_candidate = lexical_search_query(map_text, es_conn)
        result_dict_postocr[map_text] = map_text_candidate

    for feature_data in in_geojson_data["features"]:
        feature_data["properties"]["postocr_label"] = result_dict_postocr[str(feature_data["properties"]["text"]).lower()]

    with open(os.path.join(output_dir, in_geojson_filename.split("/")[-1]), 'w', encoding='utf8') as output_geojson:
        geojson.dump(in_geojson_data, output_geojson, ensure_ascii=False)

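# A minimal standalone usage sketch (assumes a local Elasticsearch node with
# the 'osm-voca' index already populated via logstash_postocr.conf):
#
#   es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
#   print(lexical_search_query('sacramenta', es))  # -> e.g. 'sacramento'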
def main(args):
    geojson_file = args.in_geojson_file
    output_dir = args.out_geojson_dir


    try:
        es = elasticsearch.Elasticsearch([{'host': "127.0.0.1", 'port': 9200}], timeout=1000)
        es_connected = es.ping()
    except:
        logging.warning('elasticsearch.ConnectionError.ElasticConnectionError while running %s', geojson_file.split("/")[-1])
        return
    if not es_connected:
        logging.warning('Error on elasticsearch connection while running %s', geojson_file.split("/")[-1])
        return
    es_logger = elasticsearch.logger
    es_logger.setLevel(elasticsearch.logging.WARNING)

    with open(geojson_file) as f:
        try:
            data = geojson.load(f)
        except json.decoder.JSONDecodeError:
            if os.path.getsize(geojson_file) == 0:
                # empty input; create an empty output file
                with open(os.path.join(output_dir, geojson_file.split("/")[-1]), 'w') as fp:
                    pass
            else:
                logging.info('JSONDecodeError %s', geojson_file)
            # sys.exit(1)
            return

    unique_map_text = []
    for feature_data in data['features']:
        unique_map_text.append(str(feature_data['properties']['text']).lower())

    # if postocr_only:
    save_postocr_results(data, unique_map_text, es, output_dir, geojson_file)
    logging.info('Done generating standalone post-ocr geojson for %s', geojson_file.split("/")[-1])
    # return


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_geojson_file', type=str, default='data/100_maps_geojson_abc_geocoord/',
                        help='input geojson')
    parser.add_argument('--out_geojson_dir', type=str, default='data/100_maps_geojson_abc_linked/',
                        help='output dir for converted geojson files')

    args = parser.parse_args()

    main(args)
--------------------------------------------------------------------------------
/m4_post_ocr/preprocess.py:
--------------------------------------------------------------------------------
from elasticsearch import Elasticsearch
import logging
import requests
import json

import http.client as http_client

import pandas as pd
import string
import emoji
import time

import glob
import os



def read_name():
    http_client.HTTPConnection.debuglevel = 1

    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)

    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.DEBUG)
    requests_log.propagate = True


    # popularity count
    headers = {
        'Content-Type': 'application/json',
    }

    json_body = '{"track_total_hits": true}'

    resp = requests.get(f'http://localhost:9200/osm/_search?&pretty=true',
                        data=json_body,
                        headers=headers)
    resp_json = json.loads(resp.text)
    total_value = resp_json["hits"]["total"]["value"]


    # initialize search_after pagination
    json_body_page = '{"track_total_hits": true, "size": 10000, "sort": [{"ogc_fid": {"order" : "desc" }}]}'
    resp_page = requests.post(f'http://localhost:9200/osm/_search?',
                              data=json_body_page,
                              headers=headers)
    resp_page_json = json.loads(resp_page.text)

    name_list = []

    st = []
    for h in range(len(resp_page_json["hits"]["hits"])):
        st = resp_page_json["hits"]["hits"][h]["sort"]
        text = resp_page_json["hits"]["hits"][h]["_source"]["name"]
        token_list = text.split(" ")
        for t in range(len(token_list)):
            name_list.append(token_list[t].lower())

    n_val = len(resp_page_json["hits"]["hits"])
    st_list = [st[0]]
    error_track = []

    # iterate over pages with search_after
    while n_val != total_value:

        try:  # osm_id.keyword
            json_body_page2 = '{"track_total_hits": true, "size": 10000, "sort": [{"ogc_fid": {"order" : "desc" }}], "search_after": [' + str(st[0]) + ']}'
            resp_page2 = requests.get(f'http://localhost:9200/osm/_search?',
                                      data=json_body_page2,
                                      headers=headers)
            resp_page_json2 = json.loads(resp_page2.text)

            for h in range(len(resp_page_json2["hits"]["hits"])):
                st = resp_page_json2["hits"]["hits"][h]["sort"]
                text = resp_page_json2["hits"]["hits"][h]["_source"]["name"]
                token_list = text.split(" ")
                for t in range(len(token_list)):
                    name_list.append(token_list[t].lower())

            n_val += len(resp_page_json2["hits"]["hits"])
            st_list.append(st[0])
            print(f'n_val: {n_val} done!')

        except Exception as e:
            print(e)  # Exception objects have no .message attribute in Python 3
            error_track.append(str(st[0]))

            # checkpoint progress on failure
            with open('error_id.txt', 'w') as fp:
                for item in error_track:
                    fp.write("%s\n" % item)
                print('Done')

            with open('name_mid.txt', 'w') as fp:
                for item in name_list:
                    fp.write("%s\n" % item)
                print('Done')

            with open('last_sort_id.txt', 'w') as fp:
                for item in st_list:
                    fp.write("%s\n" % item)
                print('Done')

    with open('name.txt', 'w') as fp:
        for item in name_list:
            fp.write("%s\n" % item)
        print('Done')

    with open('name_set.txt', 'w') as fp:
        name_set = list(set(name_list))
        for item in name_set:
            fp.write("%s\n" % item)
        print('Done')



def counting():
    input_txt = "name.txt"

    if os.path.exists(input_txt):

        punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
        start2 = time.time()
        set_lst2 = []
        with open(input_txt) as file:
            for item in file:
                name = emoji.replace_emoji(item.strip(), '')  # filter out emoji
                name = name.translate(str.maketrans('', '', string.punctuation))
                if len(name) > 0:
                    set_lst2.append(name.upper())

        end2 = time.time()
        start = time.time()

        dic = {}
        count = 0

        for word in set_lst2:
            count += 1
            if word in dic:
                dic[word] += 1
            else:
                dic[word] = 1

        end = time.time()

        print(end - start)
        print(end2 - start2)
        dff = pd.DataFrame.from_dict([dic]).T

        dff.reset_index(inplace=True)
        # after transposing, the value column is the integer 0, not the string '0'
        dff = dff.rename(columns={'index': 'name', 0: 'count'})
        dff.to_csv("out.csv", index=False)


if __name__ == '__main__':
    read_name()
    counting()
--------------------------------------------------------------------------------
/m5_geocoordinate_converter/convert_geojson_to_geocoord.py:
--------------------------------------------------------------------------------
import os
import argparse
import logging
import ast
import json

import pandas as pd
import numpy as np
import geojson

logging.basicConfig(level=logging.INFO)
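# Pipeline note: stitch_output.py negates y in the image-space polygons (for
# QGIS display). main() below stores the as-stitched coordinates under
# 'img_coordinates', flips y back to positive pixel rows, and then calls
# ogr2ogr to re-project the pixel polygons to EPSG:3857 using the map's
# ground control points.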
def main(args):
    geojson_file = args.in_geojson_file
    output_dir = args.out_geojson_dir

    sample_map_df = pd.read_csv(args.sample_map_path, dtype={'image_no': str})
    # normalize image_no, e.g. '0012001.1.jp2' or '0012001.jp2' -> '0012001'
    sample_map_df['image_no'] = sample_map_df['image_no'].str.replace('.1.jp2', '', regex=False).str.replace('.jp2', '', regex=False)
    sample_map_df['image_no'] = sample_map_df['image_no'].apply(lambda x: x[:-2] if x[-2:] == '.1' else x)

    geojson_filename_id = geojson_file.split(".")[0].split("/")[-1]

    if not os.path.isdir(os.path.join(output_dir, "tmp/")):
        os.makedirs(os.path.join(output_dir, "tmp/"))

    row = sample_map_df[sample_map_df['image_no'] == geojson_filename_id]
    if not row.empty:
        gcps = ast.literal_eval(row.iloc[0]['gcps'])
        gcp_str = ''
        for gcp in gcps:
            lng, lat = gcp['location']
            x, y = gcp['pixel']
            gcp_str += '-gcp ' + str(x) + ' ' + str(y) + ' ' + str(lng) + ' ' + str(lat) + ' '

        transform_method = row.iloc[0]['transformation_method']
        assert transform_method in ['affine', 'polynomial', 'tps']

        with open(geojson_file) as img_geojson:
            try:
                img_data = geojson.load(img_geojson)
            except json.decoder.JSONDecodeError:
                if os.stat(geojson_file).st_size == 0:
                    with open(os.path.join(output_dir, geojson_filename_id + '.geojson'), 'w') as fp:
                        pass
                    logging.info('Done generating empty geocoord geojson for %s', geojson_file)
                else:
                    logging.info('JSONDecodeError %s', geojson_file)
                return

        for img_feature in img_data['features']:
            arr = np.array(img_feature['geometry']['coordinates'])
            img_feature['properties']['img_coordinates'] = np.array(arr).reshape(-1, 2).tolist()

            # undo the y negation applied during stitching
            arr[:, :, 1] *= -1
            img_feature['geometry']['coordinates'] = arr.tolist()

        with open(os.path.join(os.path.join(output_dir, "tmp/"), geojson_filename_id + '.geojson'), 'w', encoding='utf8') as geocoord_geojson:
            geojson.dump(img_data, geocoord_geojson, ensure_ascii=False)

        input_path = '"' + output_dir + "/tmp/" + geojson_filename_id + '.geojson"'
        output_path = '"' + output_dir + "/" + geojson_filename_id + '.geojson"'

        if transform_method == 'affine':
            geocoord_convert_command = 'ogr2ogr -f "GeoJSON" ' + output_path + " " + input_path + ' -order 1 -s_srs epsg:4326 -t_srs epsg:3857 -skipfailures ' + gcp_str

        elif transform_method == 'polynomial':
            geocoord_convert_command = 'ogr2ogr -f "GeoJSON" ' + output_path + " " + input_path + ' -order 2 -s_srs epsg:4326 -t_srs epsg:3857 -skipfailures ' + gcp_str

        elif transform_method == 'tps':
            geocoord_convert_command = 'ogr2ogr -f "GeoJSON" ' + output_path + " " + input_path + ' -tps -s_srs epsg:4326 -t_srs epsg:3857 -skipfailures ' + gcp_str

        else:
            raise NotImplementedError

        ret_value = os.system(geocoord_convert_command)
        if os.path.exists(os.path.join(os.path.join(output_dir, "tmp/"), geojson_filename_id + '.geojson')):
            os.remove(os.path.join(os.path.join(output_dir, "tmp/"), geojson_filename_id + '.geojson'))

        if ret_value != 0:
            logging.info('Failed generating geocoord geojson for %s', geojson_file)
        else:
            logging.info('Done generating geocoord geojson for %s', geojson_file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
                        help='path to sample map csv, which contains gcps info')
    parser.add_argument('--in_geojson_file', type=str,
                        help='input geojson file; results of the M3 stitching step')
    parser.add_argument('--out_geojson_dir', type=str, default='data/100_maps_geojson_abc_geocoord/',
                        help='output dir for converted geojson files')

    args = parser.parse_args()

    main(args)
--------------------------------------------------------------------------------
/m6_entity_linker/create_elasticsearch_index.py:
--------------------------------------------------------------------------------
import argparse
import logging

import pandas as pd

import elasticsearch
from elasticsearch import helpers

logging.basicConfig(level=logging.INFO)

def main(args):
    # elasticsearch connection
    try:
        es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}], timeout=1000)
        es_connected = es.ping()
    except:
        logging.warning('elasticsearch.ConnectionError.ElasticConnectionError while building the osm-linker csv')
        return
    if not es_connected:
        logging.warning('Error on elasticsearch connection while building the osm-linker csv')
        return
    es_logger = elasticsearch.logger
    es_logger.setLevel(elasticsearch.logging.WARNING)

    df = pd.read_csv(args.in_csv)

    for index, row in df.iterrows():
        if index % 1000 == 0: print(index, "processed ...")

        es_query = {"query": {
            "bool": {
                "must": [
                    {
                        "match": {'name': str(row['name']).lower()}
                    }
                ]
            }
        }}

        try:
            osm_count = es.count(index="osm", body=es_query)["count"]
        except elasticsearch.ElasticsearchException as es_error:
            logging.warning('ElasticsearchException while counting matches for %s', row['name'])
            continue

        # skip words with more than 10000 matched cases in OSM
        if osm_count > 10000:
            continue

        try:
            es_results = elasticsearch.helpers.scan(es, index="osm", query=es_query)
        except elasticsearch.ElasticsearchException as es_error:
            logging.warning('ElasticsearchException while scanning matches for %s', row['name'])
            continue

        es_results = [(hit["_source"]['source_table'], hit["_source"]['osm_id']) for hit in es_results]
        if len(es_results) == 0:
            # no elasticsearch results for this word
            continue

        df.loc[index, 'source_table_osm_id'] = str(es_results)

    df = df.dropna()
    df.to_csv(args.out_csv, index=False)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_csv', type=str, default='out.csv', help='input csv')
    parser.add_argument('--out_csv', type=str, default='./m6_entity_linker/osm_linker.csv', help='output csv')
    args = parser.parse_args()

    main(args)
--------------------------------------------------------------------------------
/m6_entity_linker/create_spatial_index_postgres.py:
--------------------------------------------------------------------------------
import os

from dotenv import load_dotenv
import psycopg2


load_dotenv()
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_USERNAME = os.getenv("DB_USERNAME")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_NAME = os.getenv("DB_NAME")

conn = psycopg2.connect(database=DB_NAME, host=DB_HOST, user=DB_USERNAME, password=DB_PASSWORD, port=DB_PORT)
cur = conn.cursor()
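# For each continent schema and table below, two indexes are created; e.g. for
# africa.points the generated statements are:
#   CREATE INDEX africa_points_index ON africa.points USING gist (wkb_geometry);
#   CREATE INDEX africa_points_osm_index ON africa.points (osm_id);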
continents = ['africa', 'asia', 'central_america', 'north_america', 'south_america', 'antarctica', 'australia_oceania', 'europe']
tables = ['points', 'lines', 'multilinestrings', 'multipolygons', 'other_relations']

for continent in continents:
    for table in tables:
        name = continent + "." + table
        cur.execute(f'''CREATE INDEX {name.replace(".","_")+"_index"} ON {name} USING gist (wkb_geometry);''')
        cur.execute(f'''CREATE INDEX {name.replace(".","_")+"_osm_index"} ON {name} (osm_id);''')
        print(name, " creating index...")

conn.commit()
cur.close()
conn.close()
--------------------------------------------------------------------------------
/m6_entity_linker/entity_linking.py:
--------------------------------------------------------------------------------
import os
import argparse
import ast
import logging
import time

import pandas as pd
import numpy as np
import geojson
import json

from shapely.ops import transform
from shapely.geometry import Polygon
import pyproj

import elasticsearch

from dotenv import load_dotenv
import psycopg2
from postgres_logger import LinkerLoggingConnection

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main(args):
    input_dir = args.in_geojson_dir
    output_dir = args.out_geojson_dir

    # elasticsearch connection
    try:
        es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}], timeout=1000)
        es_connected = es.ping()
    except:
        logging.warning('elasticsearch.ConnectionError.ElasticConnectionError')
        return
    if not es_connected:
        logging.warning('Error on elasticsearch connection')
        return
    es_logger = elasticsearch.logger
    es_logger.setLevel(elasticsearch.logging.WARNING)

    # postgres connection
    load_dotenv()
    DB_HOST = os.getenv("DB_HOST")
    DB_PORT = os.getenv("DB_PORT")
    DB_USERNAME = os.getenv("DB_USERNAME")
    DB_PASSWORD = os.getenv("DB_PASSWORD")
    DB_NAME = os.getenv("DB_NAME")

    try:
        conn = psycopg2.connect(host=DB_HOST, database=DB_NAME, user=DB_USERNAME, password=DB_PASSWORD, port=DB_PORT, connection_factory=LinkerLoggingConnection)
    except Exception as e:
        logging.warning('Error on psycopg2 connection: %s', e)
        return

    sample_map_df = pd.read_csv(args.sample_map_path, dtype={'image_no': str})
    sample_map_df['image_no'] = sample_map_df['image_no'].str.replace('.1.jp2', '', regex=False).str.replace('.jp2', '', regex=False)
    sample_map_df['image_no'] = sample_map_df['image_no'].apply(lambda x: x[:-2] if x[-2:] == '.1' else x)

    conn.initialize(logger)
    conn.autocommit = True

    with conn.cursor() as cur:
        for index, record in sample_map_df.iterrows():
            input_geojson_file = os.path.join(input_dir, record.image_no + ".geojson")

            if not os.path.exists(input_geojson_file):
                logging.warning('PostOCR output does not exist %s', record.image_no + ".geojson")
                continue

            if os.path.exists(os.path.join(output_dir, input_geojson_file.split("/")[-1])):
                logging.info('EntityLinker output already exists %s', record.image_no + ".geojson")
                continue

            with open(input_geojson_file) as f:
                try:
                    data = geojson.load(f)
                except json.decoder.JSONDecodeError:
                    if os.path.getsize(input_geojson_file) == 0:
                        with open(os.path.join(output_dir, input_geojson_file.split("/")[-1]), 'w') as fp:
                            pass
                        continue
                    else:
                        logging.info('JSONDecodeError %s', input_geojson_file)
                        continue
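            # Each elasticsearch hit stores a stringified list of
            # (source_table, osm_id) pairs built by create_elasticsearch_index.py,
            # e.g. "[('north_america.points', '12345'), ('europe.lines', '67890')]";
            # the candidates are then verified spatially against PostGIS below.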
            for feature_data in data['features']:
                map_text = str(feature_data['properties']['postocr_label'])

                # skip null geometry
                if feature_data['geometry'] is None:
                    feature_data["properties"]["osm_id"] = []
                    continue

                # skip text of 3 characters or fewer
                if len(map_text) <= 3:
                    feature_data["properties"]["osm_id"] = []
                    continue

                pts = np.array(feature_data['geometry']['coordinates']).reshape(-1, 2)
                map_polygon = Polygon(pts)

                es_query = {
                    "bool": {
                        "must": [
                            {
                                "match": {'name': map_text.lower()}
                            }
                        ]
                    }
                }

                try:
                    es_results = es.search(index="osm-linker", query=es_query)
                except elasticsearch.ElasticsearchException as es_error:
                    logging.warning('ElasticsearchException while running %s', input_geojson_file.split("/")[-1])
                    continue

                if es_results['hits']['total']['value'] == 0:
                    feature_data["properties"]["osm_id"] = []
                    continue

                es_results = [ast.literal_eval(hit["_source"]['source_table_osm_id']) for hit in es_results['hits']['hits']][0]
                output_osm_ids = []
                source_tables = set([table for table, _ in es_results if "other_relations" not in table])

                for source_table in source_tables:
                    sql = ""
                    osm_ids = [osm_id for table, osm_id in es_results if table == source_table]

                    if "points" in source_table:
                        sql = f"""SELECT osm_id
                                  FROM {source_table}
                                  WHERE ST_CONTAINS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), wkb_geometry)
                                  AND osm_id = ANY (%s)
                               """

                    elif "line" in source_table:
                        sql = f"""SELECT osm_id
                                  FROM {source_table}
                                  WHERE ST_INTERSECTS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), wkb_geometry)
                                  AND osm_id = ANY (%s)
                               """

                    elif "polygon" in source_table:
                        sql = f"""SELECT osm_id
                                  FROM {source_table}
                                  WHERE ST_INTERSECTS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), ST_MakeValid(wkb_geometry, 'method=structure'))
                                  AND osm_id = ANY (%s)
                               """

                    try:
                        cur.execute(sql, (osm_ids,))
                    except Exception as e:
                        logging.warning('Error occurred while executing sql for %s: %s', input_geojson_file.split("/")[-1], e)
                        if "TopologyException" in repr(e):
                            continue
                        else:
                            return

                    sql_result = cur.fetchall()
                    if len(sql_result) != 0:
                        output_osm_ids.extend([x[0] for x in sql_result])

                feature_data["properties"]["osm_id"] = output_osm_ids

            with open(os.path.join(output_dir, input_geojson_file.split("/")[-1]), 'w', encoding='utf8') as output_geojson:
                geojson.dump(data, output_geojson, ensure_ascii=False)
            logging.info('Done generating geojson for %s', input_geojson_file.split("/")[-1])

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
                        help='path to sample map csv, which contains gcps info')
    parser.add_argument('--in_geojson_dir', type=str, default='data/100_maps_geojson_abc_geocoord/',
                        help='input geojson dir')
    parser.add_argument('--out_geojson_dir', type=str, default='data/100_maps_geojson_abc_linked/',
                        help='output dir for converted geojson files')
    args = parser.parse_args()
    main(args)
--------------------------------------------------------------------------------
/m6_entity_linker/logstash_osm_linker.conf:
--------------------------------------------------------------------------------
input {
  file {
    # NOTE: logstash's file input expects an absolute path
    path => "./m6_entity_linker/osm_linker.csv"
    start_position => beginning
  }
}
filter {
  csv {
    separator => ","
    columns => [
      "name",
      "0",
      "source_table_osm_id"
    ]
  }
}
output {
  elasticsearch {
    hosts => "localhost:9200"
    index => "osm-linker"
  }
}
--------------------------------------------------------------------------------
/m6_entity_linker/logstash_postgres_world.conf:
--------------------------------------------------------------------------------
input {
  jdbc {
    jdbc_connection_string => "jdbc:postgresql://localhost:5432/osm"
    jdbc_user => ""
    jdbc_password => ""
    jdbc_paging_enabled => true
    jdbc_fetch_size => 100000
    jdbc_driver_library => "/usr/share/logstash/logstash-core/lib/jars/postgresql-42.6.0.jar"
    jdbc_driver_class => "org.postgresql.Driver"
    statement => "SELECT ogc_fid, osm_id, name, source_table FROM entire_continents ORDER BY ogc_fid"
  }
}
output {
  elasticsearch {
    hosts => "localhost:9200"
    index => "osm"
    document_id => "%{ogc_fid}"
    doc_as_upsert => true
  }
}
--------------------------------------------------------------------------------
/m6_entity_linker/postgres_logger.py:
--------------------------------------------------------------------------------
import time

import psycopg2
from psycopg2.extras import LoggingConnection, LoggingCursor

class LinkerLoggingCursor(LoggingCursor):
    def execute(self, query, vars=None):
        self.timestamp = time.time()
        return super(LinkerLoggingCursor, self).execute(query, vars)

    def callproc(self, procname, vars=None):
        self.timestamp = time.time()
        return super(LinkerLoggingCursor, self).callproc(procname, vars)

class LinkerLoggingConnection(LoggingConnection):
    def filter(self, msg, curs):
        return msg.decode(psycopg2.extensions.encodings[self.encoding], 'replace') + " %d ms" % int((time.time() - curs.timestamp) * 1000)

    def cursor(self, *args, **kwargs):
        kwargs.setdefault('cursor_factory', LinkerLoggingCursor)
        return LoggingConnection.cursor(self, *args, **kwargs)
--------------------------------------------------------------------------------
/m6_entity_linker/upload_osm_to_postgres_all_continents.py:
--------------------------------------------------------------------------------
import os

from dotenv import load_dotenv
import psycopg2


load_dotenv()
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_USERNAME = os.getenv("DB_USERNAME")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_NAME = os.getenv("DB_NAME")

conn = psycopg2.connect(database=DB_NAME, host=DB_HOST, user=DB_USERNAME, password=DB_PASSWORD, port=DB_PORT)
cur = conn.cursor()
cur.execute('''CREATE TABLE entire_continents (
    ogc_fid SERIAL PRIMARY KEY,
    osm_id character varying,
    name character varying,
    source_table character varying
);''')
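# Example of one generated statement (for africa.points):
#   INSERT INTO entire_continents(osm_id, name, source_table)
#   SELECT osm_id, name, 'africa.points' FROM africa.points
#   WHERE name IS NOT NULL AND osm_id IS NOT NULL;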
continents = ['africa', 'asia', 'central_america', 'north_america', 'south_america', 'antarctica', 'australia_oceania', 'europe']
tables = ['points', 'lines', 'multilinestrings', 'multipolygons', 'other_relations']

for continent in continents:
    for table in tables:
        name = continent + "." + table
        cur.execute(f'''INSERT INTO entire_continents(osm_id, name, source_table)
                        SELECT osm_id, name, '{name}' FROM {name}
                        WHERE name IS NOT NULL AND osm_id IS NOT NULL;''')
        print(name, " inserting into entire_continents...")

conn.commit()
cur.close()
conn.close()
--------------------------------------------------------------------------------
/m6_entity_linker/upload_osm_to_postgres_ogr2ogr.py:
--------------------------------------------------------------------------------
from dotenv import load_dotenv
import subprocess
import logging
import os

import psycopg2

continents = ['africa', 'asia', 'central_america', 'north_america', 'south_america', 'antarctica', 'australia_oceania', 'europe']

load_dotenv()
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_USERNAME = os.getenv("DB_USERNAME")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_NAME = os.getenv("DB_NAME")


try:
    conn = psycopg2.connect(host=DB_HOST, database=DB_NAME, user=DB_USERNAME, password=DB_PASSWORD)
except Exception as e:
    logging.warning('Error on psycopg2 connection: %s', e)

cur = conn.cursor()

for continent in continents:
    cur.execute(f'''CREATE SCHEMA {continent};''')
    print(continent, " creating schema...")

conn.commit()
cur.close()
conn.close()

for continent in continents:
    cmd = f'''ogr2ogr -f PostgreSQL PG:"dbname='{DB_NAME}' host='{DB_HOST}' port='{DB_PORT}' user='{DB_USERNAME}' password='{DB_PASSWORD}'" {continent.replace('_','-')}-latest.osm.pbf -nlt PROMOTE_TO_MULTI -lco SCHEMA={continent}'''
    print("--", continent, "--")
    print(cmd)
    subprocess.call(cmd, shell=True)
--------------------------------------------------------------------------------
/m_sanborn/s1_geocoding.py:
--------------------------------------------------------------------------------
import os
import argparse
import geojson
import geocoder
import json
import time
import pdb


def arcgis_geocoding(place_name, maxRows=5):
    try:
        response = geocoder.arcgis(place_name, maxRows=maxRows)
        return response.json
    except Exception as e:
        print(e)
        return -1


def google_geocoding(place_name, api_key=None, maxRows=5):
    try:
        response = geocoder.google(place_name, key=api_key, maxRows=maxRows)
        return response.json
    except Exception as e:
        print(e)
        return -1

def osm_geocoding(place_name, maxRows=5):
    try:
        response = geocoder.osm(place_name, maxRows=maxRows)
        return response.json
    except Exception as e:
        print(e)
        return -1


def geonames_geocoding(place_name, user_name=None, maxRows=5):
    try:
        response = geocoder.geonames(place_name, key=user_name, maxRows=maxRows)
        # geonames enforces an hourly limit of 1000 credits
        time.sleep(4)
        return response.json
    except Exception as e:
        print(e)
        return -1
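# Quick sanity check for the wrappers above (a sketch; requires network access
# and the `geocoder` package, and the output shape depends on the provider):
#
#   >>> arcgis_geocoding('Los Angeles City Hall', maxRows=1)
#   {'address': '...', 'lat': 34.05..., 'lng': -118.24..., ...}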
open(input_map_geojson_path, 'r') as f: 57 | data = geojson.load(f) 58 | 59 | map_name = os.path.basename(input_map_geojson_path).split('.')[0] 60 | output_folder = os.path.join(output_folder, geocoder_option) 61 | 62 | if not os.path.isdir(output_folder): 63 | os.makedirs(output_folder) 64 | 65 | output_path = os.path.join(output_folder, map_name) + '.json' 66 | 67 | with open(output_path, 'w') as f: 68 | pass # create/truncate the output file before results are appended below 69 | 70 | features = data['features'] 71 | for feature in features: # iterate through all the detected text labels 72 | geometry = feature['geometry'] 73 | text = feature['properties']['text'] 74 | score = feature['properties']['score'] 75 | 76 | # suffix = ', Los Angeles' 77 | text = str(text) + (suffix or '') # guard against suffix=None (the --suffix default) 78 | 79 | print(text) 80 | 81 | if geocoder_option == 'arcgis': 82 | results = arcgis_geocoding(text, maxRows = max_results) 83 | elif geocoder_option == 'google': 84 | results = google_geocoding(text, api_key = api_key, maxRows = max_results) 85 | elif geocoder_option == 'geonames': 86 | results = geonames_geocoding(text, user_name = user_name, maxRows = max_results) 87 | elif geocoder_option == 'osm': 88 | results = osm_geocoding(text, maxRows = max_results) 89 | else: 90 | raise NotImplementedError 91 | 92 | if results == -1: 93 | # geocoder could not find a match 94 | pass 95 | else: 96 | # save results 97 | with open(output_path, 'a') as f: 98 | json.dump({'text':text, 'score':score, 'geometry': geometry, 'geocoding':results}, f) 99 | f.write('\n') 100 | 101 | # pdb.set_trace() 102 | 103 | 104 | def main(): 105 | parser = argparse.ArgumentParser() 106 | 107 | parser.add_argument('--output_folder', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geocoding/') 108 | parser.add_argument('--input_map_geojson_path', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geojson_testr/service-gmd-gmd436m-g4364m-g4364lm-g4364lm_g00656189401-00656_01_1894-0001l.geojson') 109 | parser.add_argument('--api_key', type=str, default=None, help='Specify API key if needed') 110 | parser.add_argument('--user_name', type=str, default=None, help='Specify user name if needed') 111 | 112 | parser.add_argument('--suffix', type=str, default=None, help='placename suffix (e.g. 
city name)') 113 | 114 | parser.add_argument('--max_results', type=int, default=5, help='max number of results returned by geocoder') 115 | 116 | parser.add_argument('--geocoder_option', type=str, default='arcgis', 117 | choices=['arcgis', 'google','geonames','osm'], 118 | help='Select geocoder option from ["arcgis","google","geonames","osm"]') # select geocoder 119 | 120 | 121 | args = parser.parse_args() 122 | print('\n') 123 | print(args) 124 | print('\n') 125 | 126 | if not os.path.isdir(args.output_folder): 127 | os.makedirs(args.output_folder) 128 | 129 | geocoding(args) 130 | 131 | 132 | if __name__ == '__main__': 133 | 134 | main() 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /m_sanborn/s2_clustering.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from sklearn.cluster import DBSCAN 5 | from matplotlib import pyplot as plt 6 | import geopandas as gpd 7 | import pandas as pd 8 | from bs4 import BeautifulSoup 9 | from mpl_toolkits.basemap import Basemap 10 | from pyproj import Proj, transform 11 | 12 | from shapely.geometry import Point 13 | from shapely.geometry.polygon import Polygon 14 | import numpy as np 15 | from shapely.geometry import MultiPoint 16 | from geopy.distance import great_circle 17 | 18 | 19 | county_index_dict = {'Cuyahoga County (OH)': 193, 20 | 'Fulton County (GA)': 73, 21 | 'Kern County (CA)': 2872, 22 | 'Lancaster County (NE)': 1629, 23 | 'Los Angeles County (CA)': 44, 24 | 'Mexico': -1, 25 | 'Nevada County (CA)': 46, 26 | 'New Orleans (LA)': -1, 27 | 'Pima County (AZ)': 2797, 28 | 'Placer County (CA)': 1273, 29 | 'Providence County (RI)\xa0': 1124, 30 | 'Saint Louis (MO)': -1, 31 | 'San Francisco County (CA)': 1261, 32 | 'San Joaquin County (CA)': 1213, 33 | 'Santa Clara (CA)': 48, 34 | 'Santa Cruz (CA)': 2386, 35 | 'Suffolk County (MA)': 272, 36 | 'Tulsa County (OK)': 526, 37 | 'Washington County (AK)': -1, 38 | 'Washington DC': -1} 39 | 40 | def get_centermost_point(cluster): 41 | centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y) 42 | centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m) 43 | return tuple(centermost_point) 44 | 45 | def clustering_func(lat_list, lng_list): 46 | X = [[a,b] for a,b in zip(lat_list, lng_list)] 47 | coords = np.array(X) 48 | 49 | # https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/ 50 | kms_per_radian = 6371.0088 51 | epsilon = 1.5 / kms_per_radian 52 | db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords)) 53 | cluster_labels = db.labels_ 54 | num_clusters = len(set(cluster_labels)) 55 | clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)]) 56 | 57 | centermost_points = get_centermost_point(clusters[0]) # takes the first DBSCAN cluster as the representative one 58 | return centermost_points 59 | 60 | def plot_points(lat_list, lng_list, target_lat_list=None, target_lng_list = None, pred_lat=None, pred_lng = None, title = None): 61 | 62 | plt.figure(figsize=(10,6)) 63 | plt.title(title) 64 | 65 | plt.scatter(lng_list, lat_list, marker='o', c = 'violet', alpha=0.5) 66 | if pred_lat is not None and pred_lng is not None: 67 | plt.scatter(pred_lng, pred_lat, marker='o', c = 'red') 68 | 69 | if target_lat_list is not None and target_lng_list is not None: 70 | plt.scatter(target_lng_list, target_lat_list, 10, c = 'blue') 71 | plt.show() 72 | 73 | def 
plot_points_basemap(lat_list, lng_list, target_lat_list=None, target_lng_list = None, pred_lat=None, pred_lng = None, title = None): 74 | 75 | plt.figure(figsize=(10,6)) 76 | plt.title(title) 77 | 78 | if len(lat_list) >0 and len(lng_list) > 0: 79 | anchor_lat, anchor_lng = lat_list[0], lng_list[0] 80 | elif target_lat_list is not None: 81 | anchor_lat, anchor_lng = target_lat_list[0], target_lng_list[0] 82 | else: 83 | anchor_lat, anchor_lng = 45, -100 84 | 85 | m = Basemap(projection='lcc', resolution=None, 86 | width=8E4, height=8E4, 87 | lat_0=anchor_lat, lon_0=anchor_lng) 88 | m.etopo(scale=0.5, alpha=0.5) 89 | # m.arcgisimage(service='ESRI_Imagery_World_2D', xpixels = 2000, verbose= True) 90 | # m.arcgisimage(service='ESRI_Imagery_World_2D',scale=0.5, alpha=0.5) 91 | 92 | 93 | lng_list, lat_list = m(lng_list, lat_list) # transform coordinates 94 | plt.scatter(lng_list, lat_list, marker='o', c = 'violet', alpha=0.5) 95 | 96 | 97 | if target_lat_list is not None and target_lng_list is not None: 98 | target_lng_list, target_lat_list = m(target_lng_list, target_lat_list) 99 | plt.scatter(target_lng_list, target_lat_list, marker='o', c = 'blue',edgecolor='blue') 100 | 101 | if pred_lat is not None and pred_lng is not None: 102 | pred_lng, pred_lat = m(pred_lng, pred_lat) 103 | plt.scatter(pred_lng, pred_lat, marker='o', c = 'red', edgecolor='black') 104 | 105 | plt.show() 106 | 107 | def plotting_func(loc_sanborn_dir, pred_dict, lat_lng_dict, dataset_name, geocoding_name): 108 | inProj, outProj = Proj(init='epsg:3857'), Proj(init='epsg:4326') # web-mercator GCPs -> lat/lng for transform() below 109 | for map_name, pred in pred_dict.items(): 110 | 111 | title = dataset_name + '-' + geocoding_name + '-' + map_name 112 | lat_list = lat_lng_dict[map_name]['lat_list'] 113 | lng_list = lat_lng_dict[map_name]['lng_list'] 114 | 115 | if dataset_name == 'LoC_sanborn': 116 | xml_path = os.path.join(loc_sanborn_dir,map_name + '.tif.aux.xml') 117 | try: 118 | with open(xml_path) as fp: 119 | soup = BeautifulSoup(fp) 120 | 121 | target_gcp_list = soup.findAll("metadata")[1].targetgcps.findAll("double") 122 | except Exception as e: 123 | print(xml_path) 124 | continue 125 | 126 | xy_list = [] 127 | for target_gcp in target_gcp_list: 128 | xy_list.append(float(target_gcp.contents[0])) 129 | 130 | x_list = xy_list[0::2] 131 | y_list = xy_list[1::2] 132 | 133 | lng2_list, lat2_list = [],[] 134 | for x1,y1 in zip(x_list, y_list): 135 | x2,y2 = transform(inProj,outProj,x1,y1) 136 | #print (x2,y2) 137 | lng2_list.append(x2) 138 | lat2_list.append(y2) 139 | 140 | plot_points(lat_list, lng_list, lat2_list, lng2_list, pred_lat = pred[0], pred_lng = pred[1], title=title) 141 | else: 142 | plot_points(lat_list, lng_list,pred_lat = pred[0], pred_lng = pred[1], title=title) 143 | 144 | 145 | def clustering(args): 146 | dataset_name = args.dataset_name 147 | geocoding_name = args.geocoding_name 148 | remove_duplicate_location = args.remove_duplicate_location 149 | visualize = args.visualize 150 | 151 | sanborn_output_dir = '/data2/sanborn_maps_output' 152 | 153 | input_dir=os.path.join(sanborn_output_dir, dataset_name, 'geocoding_suffix_testr', geocoding_name) 154 | if remove_duplicate_location: 155 | output_dir = os.path.join(sanborn_output_dir, dataset_name, 'clustering_testr_removeduplicate', geocoding_name) 156 | else: 157 | output_dir = os.path.join(sanborn_output_dir, dataset_name, 'clustering_testr', geocoding_name) 158 | 159 | county_boundary_path = '/home/zekun/Sanborn/cb_2018_us_county_500k/cb_2018_us_county_500k.shp' 160 | 161 | if not 
os.path.isdir(output_dir): 162 | os.makedirs(output_dir) 163 | 164 | inProj = Proj(init='epsg:3857') 165 | outProj = Proj(init='epsg:4326') 166 | 167 | county_boundary_df = gpd.read_file(county_boundary_path) 168 | 169 | if dataset_name == 'LoC_sanborn': 170 | loc_sanborn_dir = '/data2/sanborn_maps/Sanborn100_Georef/' # for comparing with GT 171 | metadata_tsv_path = '/home/zekun/Sanborn/Sheet_List.tsv' 172 | meta_df = pd.read_csv(metadata_tsv_path, sep='\t') 173 | 174 | file_list = os.listdir(input_dir) 175 | 176 | pred_dict = dict() 177 | lat_lng_dict = dict() 178 | for file_path in file_list: 179 | 180 | map_name = os.path.basename(file_path).split('.')[0] 181 | if dataset_name == 'LoC_sanborn': 182 | county_name = meta_df[meta_df['filename'] == map_name]['County'].values[0] 183 | elif dataset_name in ('LA_sanborn', 'two_more'): # both datasets use the LA county boundary 184 | county_name = 'Los Angeles County (CA)' 185 | else: 186 | raise NotImplementedError 187 | 188 | index = county_index_dict[county_name] 189 | if index >= 0: 190 | poly_geometry = county_boundary_df.iloc[index].geometry 191 | 192 | with open(os.path.join(input_dir,file_path), 'r') as f: 193 | data = f.readlines() 194 | 195 | lat_list = [] 196 | lng_list = [] 197 | for line in data: 198 | 199 | line_dict = json.loads(line) 200 | geocoding_dict = line_dict['geocoding'] 201 | text = line_dict['text'] 202 | score = line_dict['score'] 203 | geometry = line_dict['geometry'] 204 | 205 | if geocoding_dict is None: 206 | continue # if no geolocation returned by geocoder, then skip 207 | 208 | if 'lat' not in geocoding_dict or 'lng' not in geocoding_dict: 209 | #print(geocoding_dict) 210 | continue 211 | 212 | lat = float(geocoding_dict['lat']) 213 | lng = float(geocoding_dict['lng']) 214 | 215 | point = Point(lng, lat) 216 | 217 | if index >= 0: 218 | if point.within(poly_geometry): # geocoding point within county boundary 219 | lat_list.append(lat) 220 | lng_list.append(lng) 221 | else: 222 | pass 223 | else: # cluster based on all results 224 | lat_list.append(lat) 225 | lng_list.append(lng) 226 | 227 | if remove_duplicate_location: 228 | unique_points = set(zip(lat_list, lng_list)) # deduplicate (lat, lng) pairs together so coordinates stay aligned 229 | lat_list, lng_list = [p[0] for p in unique_points], [p[1] for p in unique_points] 230 | 231 | if len(lat_list) >0 and len(lng_list) > 0: 232 | pred = clustering_func(lat_list, lng_list) 233 | # print(pred) 234 | else: 235 | print('No data to cluster') 236 | continue # no usable geocoding results for this map, so no center can be predicted 237 | print(map_name, pred) 238 | pred_dict[map_name] = pred 239 | lat_lng_dict[map_name]={'lat_list':lat_list, 'lng_list':lng_list} 240 | 241 | if visualize: 242 | plotting_func(loc_sanborn_dir = loc_sanborn_dir, pred_dict = pred_dict, lat_lng_dict = lat_lng_dict, 243 | dataset_name = dataset_name, geocoding_name = geocoding_name) 244 | 245 | with open(os.path.join(output_dir, 'pred_center.json'),'w') as f: 246 | json.dump(pred_dict, f) 247 | 248 | 249 | def main(): 250 | parser = argparse.ArgumentParser() 251 | 252 | parser.add_argument('--dataset_name', type=str, default=None, 253 | choices=['LA_sanborn', 'LoC_sanborn',], 254 | help='dataset name, same as expt_name') 255 | parser.add_argument('--geocoding_name', type=str, default=None, 256 | choices=['google','arcgis','geonames','osm'], 257 | help='geocoder name') 258 | parser.add_argument('--visualize', default = False, action = 'store_true') # Enable this when in notebook 259 | parser.add_argument('--remove_duplicate_location', default=False, action='store_true') # whether to remove duplicate geolocations before clustering 260 | 261 | # parser.add_argument('--output_folder', type=str, 
default='/data2/sanborn_maps_output/LA_sanborn/geocoding/') 262 | # parser.add_argument('--input_map_geojson_path', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geojson_testr/service-gmd-gmd436m-g4364m-g4364lm-g4364lm_g00656189401-00656_01_1894-0001l.geojson') 263 | 264 | 265 | args = parser.parse_args() 266 | print('\n') 267 | print(args) 268 | print('\n') 269 | 270 | clustering(args) 271 | 272 | 273 | if __name__ == '__main__': 274 | 275 | main() 276 | -------------------------------------------------------------------------------- /m_sanborn/s3_gen_geojson.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knowledge-computing/mapkurator-system/5b765d99c4898ce07654d904b6f3b608b9e76189/m_sanborn/s3_gen_geojson.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knowledge-computing/mapkurator-system/5b765d99c4898ce07654d904b6f3b608b9e76189/requirements.txt -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import argparse 4 | import time 5 | import logging 6 | import pandas as pd 7 | import datetime 8 | from PIL import Image 9 | from utils import get_img_path_from_external_id, get_img_path_from_external_id_and_image_no,run_pipeline 10 | 11 | import subprocess 12 | 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | Image.MAX_IMAGE_PIXELS=None # allow reading huge images 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | 20 | parser.add_argument('--map_kurator_system_dir', type=str, default='/home/maplord/rumsey/mapkurator-system/') 21 | parser.add_argument('--text_spotting_model_dir', type=str, default='/home/maplord/rumsey/TESTR/') 22 | 23 | parser.add_argument('--sample_map_csv_path', type=str, default=None) 24 | 25 | parser.add_argument('--output_folder', type=str, default='/data2/rumsey_output') # Original: /data2/rumsey_output 26 | parser.add_argument('--expt_name', type=str, default='1000_maps') # output prefix 27 | 28 | parser.add_argument('--module_get_dimension', default=False, action='store_true') 29 | parser.add_argument('--module_gen_geotiff', default=False, action='store_true') 30 | parser.add_argument('--module_cropping', default=False, action='store_true') 31 | parser.add_argument('--module_text_spotting', default=False, action='store_true') 32 | parser.add_argument('--module_img_geojson', default=False, action='store_true') 33 | parser.add_argument('--module_geocoord_geojson', default=False, action='store_true') 34 | parser.add_argument('--module_post_ocr_entity_linking', default=False, action='store_true') 35 | parser.add_argument('--module_post_ocr_only', default=False, action='store_true') 36 | parser.add_argument('--module_post_ocr', default=False, action='store_true') 37 | 38 | parser.add_argument('--spotter_model', type=str, default='spotter-v2', choices=['testr', 'spotter-v2', "palette"], 39 | help='Select text spotting model option from ["testr", "spotter-v2", "palette"]') # select text spotting model 40 | parser.add_argument('--spotter_config', type=str, default='/home/maplord/rumsey/TESTR/configs/TESTR/SynMap/SynMap_Polygon.yaml', 41 | help='Path to the config file for text spotting model') 42 | parser.add_argument('--spotter_expt_name', type=str, 
default='exp', 43 | help='Name of spotter experiment; if empty, the config file name is used') 44 | 45 | # Running spotter-testr 46 | # python run.py --text_spotting_model_dir /home/maplord/rumsey/spotter-testr/TESTR/ 47 | # --sample_map_csv_path /home/maplord/maplist_csv/luna_omo_splits/luna_omo_metadata_56628_20220724_part1.csv 48 | # --expt_name 57k_maps_r3 --module_text_spotting 49 | # --spotter_model testr --spotter_config /home/maplord/rumsey/spotter-testr/TESTR/configs/TESTR/SynthMap/SynthMap_Polygon.yaml --spotter_expt_name test 50 | # Running spotter-v2 51 | # python run.py --text_spotting_model_dir /home/maplord/rumsey/spotter-v2/PALEJUN/ 52 | # --sample_map_csv_path /home/maplord/maplist_csv/luna_omo_splits/luna_omo_metadata_56628_20220724_part1.csv 53 | # --expt_name 57k_maps_r3 --module_text_spotting 54 | # --spotter_model spotter-v2 --spotter_config /home/maplord/rumsey/spotter-v2/PALEJUN/configs/PALEJUN/SynthMap/SynthMap_Polygon.yaml --spotter_expt_name test 55 | # Running spotter-palette 56 | # python run.py --text_spotting_model_dir /home/maplord/rumsey/spotter-palette/PALETTE/ 57 | # --sample_map_csv_path /home/maplord/maplist_csv/luna_omo_splits/luna_omo_metadata_56628_20220724_part1.csv 58 | # --expt_name 57k_maps_r3 --module_text_spotting 59 | # --spotter_model palette --spotter_config /home/maplord/rumsey/spotter-palette/PALETTE/configs/PALETTE/Pretrain/SynthMap_Polygon.yaml --spotter_expt_name test 60 | 61 | parser.add_argument('--print_command', default=False, action='store_true') 62 | parser.add_argument('--gpu_id', type=int, default=0) 63 | 64 | 65 | args = parser.parse_args() 66 | print('\n') 67 | print(args) 68 | print('\n') 69 | 70 | run_pipeline(args) 71 | 72 | 73 | 74 | if __name__ == '__main__': 75 | 76 | main() 77 | 78 | 79 | -------------------------------------------------------------------------------- /run_img.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import glob 4 | import argparse 5 | import time 6 | import logging 7 | import pandas as pd 8 | import pdb 9 | import datetime 10 | from PIL import Image 11 | from utils import run_pipeline 12 | 13 | 14 | 15 | # This script handles the case where the input is a folder of images.
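# A minimal example invocation sketch (for illustration only; every path below is a placeholder
# for your own setup, not one of the authors' actual paths):
#   python run_img.py --input_dir_path /path/to/folder_of_images/ \
#       --output_folder /path/to/output --expt_name my_maps \
#       --module_cropping --module_text_spotting --module_img_geojson \
#       --spotter_model spotter-v2 --spotter_config /path/to/spotter_config.yaml --spotter_expt_name test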
16 | # tested image: /home/maplord/rumsey/mapkurator-system/data/100_maps_crop/crop_leeje_2/test_run_img/ 17 | logging.basicConfig(level=logging.INFO) 18 | Image.MAX_IMAGE_PIXELS=None # allow reading huge images 19 | 20 | 21 | 22 | def main(): 23 | parser = argparse.ArgumentParser() 24 | 25 | parser.add_argument('--map_kurator_system_dir', type=str, default='/home/maplord/rumsey/mapkurator-system/') 26 | parser.add_argument('--text_spotting_model_dir', type=str, default='/home/maplord/rumsey/TESTR/') 27 | 28 | parser.add_argument('--input_dir_path', type=str, default=None) 29 | 30 | parser.add_argument('--output_folder', type=str, default='/data2/rumsey_output') 31 | parser.add_argument('--expt_name', type=str, default='1000_maps') # output prefix 32 | 33 | parser.add_argument('--module_get_dimension', default=False, action='store_true') 34 | parser.add_argument('--module_gen_geotiff', default=False, action='store_true') 35 | parser.add_argument('--module_cropping', default=False, action='store_true') 36 | parser.add_argument('--module_text_spotting', default=False, action='store_true') 37 | parser.add_argument('--module_img_geojson', default=False, action='store_true') 38 | 39 | 40 | parser.add_argument('--spotter_model', type=str, default='spotter-v2', choices=['testr', 'spotter-v2', "palette"], 41 | help='Select text spotting model option from ["testr", "spotter-v2", "palette"]') # select text spotting model 42 | parser.add_argument('--spotter_config', type=str, default='/home/maplord/rumsey/TESTR/configs/TESTR/SynMap/SynMap_Polygon.yaml', 43 | help='Path to the config file for text spotting model') 44 | parser.add_argument('--spotter_expt_name', type=str, default='exp', 45 | help='Name of spotter experiment; if empty, the config file name is used') 46 | 47 | parser.add_argument('--print_command', default=False, action='store_true') 48 | parser.add_argument('--gpu_id', type=int, default=0) 49 | 50 | args = parser.parse_args() 51 | print('\n') 52 | print(args) 53 | print('\n') 54 | 55 | run_pipeline(args) 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 | 61 | main() 62 | 63 | 64 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pandas as pd 4 | import ast 5 | import argparse 6 | import logging 7 | import pdb 8 | from PIL import Image 9 | import datetime 10 | import subprocess 11 | import time 12 | 13 | logging.basicConfig(level=logging.INFO) 14 | Image.MAX_IMAGE_PIXELS=None 15 | 16 | 17 | def execute_command(command, if_print_command): 18 | t1 = time.time() 19 | 20 | if if_print_command: 21 | print(command) 22 | 23 | try: 24 | subprocess.run(command, shell=True, check=True, capture_output=True) # alternatively: stderr=subprocess.STDOUT 25 | t2 = time.time() 26 | time_usage = t2 - t1 27 | return {'time_usage':time_usage} 28 | except subprocess.CalledProcessError as err: 29 | error = err.stderr.decode('utf8') 30 | # format error message to one line 31 | error = error.replace('\n','\t') 32 | error = error.replace(',',';') 33 | return {'error': error} 34 | 35 | 36 | def get_img_dimension(img_path): 37 | map_img = Image.open(img_path) 38 | width, height = map_img.size 39 | 40 | return width, height 41 | 42 | 43 | def run_pipeline(args): 44 | # ------------------------- Pass arguments ----------------------------------------- 45 | map_kurator_system_dir = args.map_kurator_system_dir 46 | text_spotting_model_dir = args.text_spotting_model_dir 47 | 48 
| if hasattr(args, "sample_map_csv_path"): 49 | # run.py passes --sample_map_csv_path; run_img.py passes --input_dir_path instead 50 | sample_map_path = args.sample_map_csv_path 51 | module_geocoord_geojson = args.module_geocoord_geojson 52 | module_post_ocr_entity_linking = args.module_post_ocr_entity_linking 53 | module_post_ocr_only = args.module_post_ocr_only 54 | module_post_ocr = args.module_post_ocr 55 | 56 | elif hasattr(args, "input_dir_path"): 57 | input_dir_path = args.input_dir_path 58 | 59 | expt_name = args.expt_name 60 | output_folder = args.output_folder 61 | 62 | module_get_dimension = args.module_get_dimension 63 | module_gen_geotiff = args.module_gen_geotiff 64 | module_cropping = args.module_cropping 65 | module_text_spotting = args.module_text_spotting 66 | module_img_geojson = args.module_img_geojson 67 | 68 | spotter_model = args.spotter_model 69 | spotter_config = args.spotter_config 70 | spotter_expt_name = args.spotter_expt_name 71 | gpu_id = args.gpu_id 72 | 73 | if_print_command = args.print_command 74 | error_reason_dict = dict() 75 | 76 | if "sample_map_path" in locals(): 77 | # ------------------------- Read sample map list and prepare output dir ---------------- 78 | if sample_map_path is not None: 79 | input_csv_path = sample_map_path 80 | if input_csv_path[-4:] == '.csv': 81 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str}) 82 | elif input_csv_path[-4:] == '.tsv': 83 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str}, sep='\t') 84 | else: 85 | raise NotImplementedError 86 | 87 | external_id_to_img_path_dict, unmatched_external_id_list = get_img_path_from_external_id_and_image_no( sample_map_path = input_csv_path) 88 | 89 | # initialize error reason dict 90 | 91 | for ex_id in unmatched_external_id_list: 92 | error_reason_dict[ex_id] = {'img_path':None, 'error':'Cannot find image given external_id.'} 93 | 94 | elif "input_dir_path" in locals(): 95 | if input_dir_path is not None: 96 | input_img_path = input_dir_path 97 | sample_map_df = pd.DataFrame(columns = ["external_id"]) 98 | for images in os.listdir(input_img_path): 99 | tmp_path={"external_id": os.path.join(input_img_path,images)} 100 | sample_map_df = pd.concat([sample_map_df, pd.DataFrame([tmp_path])], ignore_index=True) # DataFrame.append was removed in pandas 2.0 101 | else: 102 | raise NotImplementedError 103 | else: 104 | raise NotImplementedError 105 | 106 | 107 | expt_out_dir = os.path.join(output_folder, expt_name) 108 | geotiff_output_dir = os.path.join(output_folder, expt_name, 'geotiff') 109 | cropping_output_dir = os.path.join(output_folder, expt_name, 'crop/') 110 | spotting_output_dir = os.path.join(output_folder, expt_name, 'spotter/' + spotter_expt_name) 111 | stitch_output_dir = os.path.join(output_folder, expt_name, 'stitch/' + spotter_expt_name) 112 | geocoord_output_dir = os.path.join(output_folder, expt_name, 'geocoord/' + spotter_expt_name) 113 | postocr_linking_output_dir = os.path.join(output_folder, expt_name, 'postocr_linking/'+ spotter_expt_name) 114 | postocr_only_output_dir = os.path.join(output_folder, expt_name, 'postocr_only/'+ spotter_expt_name) 115 | 116 | 117 | if not os.path.isdir(expt_out_dir): 118 | os.makedirs(expt_out_dir) 119 | 120 | # ------------------------ Get image dimension ------------------------------ 121 | if module_get_dimension: 122 | for index, record in sample_map_df.iterrows(): 123 | external_id = record.external_id 124 | # pdb.set_trace() 125 | if "sample_map_path" in locals(): 126 | if external_id not in external_id_to_img_path_dict: 127 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in 
external_id_to_img_path_dict'} 128 | continue 129 | 130 | img_path = external_id_to_img_path_dict[external_id] 131 | 132 | try: 133 | width, height = get_img_dimension(img_path) 134 | except Exception as e: 135 | error_reason_dict[external_id] = {'img_path':img_path, 'error': e } 136 | 137 | elif "input_dir_path" in locals(): 138 | img_path = sample_map_df['external_id'].iloc[index] 139 | width, height = get_img_dimension(img_path) 140 | 141 | map_name = os.path.basename(img_path).split('.')[0] 142 | 143 | # ------------------------- Generate geotiff ------------------------------ 144 | time_start = time.time() 145 | if module_gen_geotiff: 146 | os.chdir(os.path.join(map_kurator_system_dir ,'m1_geotiff')) 147 | 148 | if not os.path.isdir(geotiff_output_dir): 149 | os.makedirs(geotiff_output_dir) 150 | 151 | # use converted jpg folder instead of original sid folder 152 | if "sample_map_path" in locals(): 153 | merged_input_path=sample_map_path 154 | else: 155 | merged_input_path=input_dir_path 156 | 157 | run_geotiff_command = 'python convert_image_to_geotiff.py --sid_root_dir /data2/rumsey_sid_to_jpg/ --sample_map_path '+ merged_input_path +' --out_geotiff_dir '+geotiff_output_dir # can change params in argparse 158 | exe_ret = execute_command(run_geotiff_command, if_print_command) 159 | if 'error' in exe_ret: 160 | error = exe_ret['error'] 161 | 162 | 163 | 164 | # ------------------------- Image cropping ------------------------------ 165 | if module_cropping: 166 | for index, record in sample_map_df.iterrows(): 167 | external_id = record.external_id 168 | if "sample_map_path" in locals(): 169 | if external_id not in external_id_to_img_path_dict: 170 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'} 171 | continue 172 | img_path = external_id_to_img_path_dict[external_id] 173 | else: 174 | img_path = sample_map_df['external_id'].iloc[index] 175 | 176 | map_name = os.path.basename(img_path).split('.')[0] 177 | 178 | os.chdir(os.path.join(map_kurator_system_dir ,'m2_detection_recognition')) 179 | if not os.path.isdir(cropping_output_dir): 180 | os.makedirs(cropping_output_dir) 181 | 182 | run_crop_command = 'python crop_img.py --img_path '+img_path + ' --output_dir '+ cropping_output_dir 183 | 184 | exe_ret = execute_command(run_crop_command, if_print_command) 185 | 186 | if "sample_map_path" in locals(): 187 | if 'error' in exe_ret: 188 | error = exe_ret['error'] 189 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error } 190 | 191 | 192 | 193 | time_cropping = time.time() 194 | 195 | # ------------------------- Text Spotting (patch level) ------------------------------ 196 | if module_text_spotting: 197 | assert os.path.exists(spotter_config), "Config file for spotter must exist!" 
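# How this module proceeds: the spotter checkout is rebuilt in place ("python setup.py build develop",
# with build output silenced), then tools/inference.py is run once per map over its cropped patches.
# A map is skipped when its output dir already holds one JSON per cropped patch image; otherwise the
# stale output dir is removed and spotting is re-run for that map.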
198 | os.chdir(text_spotting_model_dir) 199 | os.system("python setup.py build develop 1> /dev/null") 200 | 201 | for index, record in sample_map_df.iterrows(): 202 | 203 | external_id = record.external_id 204 | if "sample_map_path" in locals(): 205 | if external_id not in external_id_to_img_path_dict: 206 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'} 207 | continue 208 | img_path = external_id_to_img_path_dict[external_id] 209 | else: 210 | img_path = sample_map_df['external_id'].iloc[index] 211 | 212 | map_name = os.path.basename(img_path).split('.')[0] 213 | # print(map_name) 214 | 215 | map_spotting_output_dir = os.path.join(spotting_output_dir, map_name) 216 | 217 | if not os.path.isdir(map_spotting_output_dir): 218 | os.makedirs(map_spotting_output_dir) 219 | else: 220 | num_existing_json = len(glob.glob(os.path.join(map_spotting_output_dir, '*.json'))) 221 | num_existing_images = len(glob.glob(os.path.join(cropping_output_dir, map_name, '*jpg'))) 222 | if num_existing_json == num_existing_images: 223 | continue 224 | else: 225 | print(f'{index}/{len(sample_map_df)}: Re-run spotting for map {map_name}') 226 | import shutil 227 | shutil.rmtree(map_spotting_output_dir) 228 | os.makedirs(map_spotting_output_dir) 229 | 230 | if spotter_model in ['testr', 'spotter-v2', 'palette']: 231 | run_spotting_command = f'CUDA_VISIBLE_DEVICES={gpu_id} python tools/inference.py --config-file {spotter_config} --output_json --input {os.path.join(cropping_output_dir,map_name)} --output {map_spotting_output_dir}' 232 | else: 233 | raise NotImplementedError 234 | 235 | # print(run_spotting_command) 236 | run_spotting_command += ' 1> /dev/null' 237 | 238 | exe_ret = execute_command(run_spotting_command, if_print_command) 239 | if "sample_map_path" in locals(): 240 | if 'error' in exe_ret: 241 | error = exe_ret['error'] 242 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error } 243 | 244 | # elif 'time_usage' in exe_ret: 245 | # time_usage = exe_ret['time_usage'] 246 | # time_usage_dict[external_id]['spotting'] = time_usage 247 | # else: 248 | # raise NotImplementedError 249 | 250 | logging.info(f'{index}/{len(sample_map_df)}: Done text spotting for {map_name}') 251 | 252 | # time_text_spotting = time.time() 253 | 254 | 255 | # ------------------------- Image coord geojson (map level) ------------------------------ 256 | if module_img_geojson: 257 | os.chdir(os.path.join(map_kurator_system_dir ,'m3_image_geojson')) 258 | 259 | if not os.path.isdir(stitch_output_dir): 260 | os.makedirs(stitch_output_dir) 261 | 262 | for index, record in sample_map_df.iterrows(): 263 | external_id = record.external_id 264 | if "sample_map_path" in locals(): 265 | if external_id not in external_id_to_img_path_dict: 266 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'} 267 | continue 268 | img_path = external_id_to_img_path_dict[external_id] 269 | else: 270 | img_path = sample_map_df['external_id'].iloc[index] 271 | map_name = os.path.basename(img_path).split('.')[0] 272 | 273 | stitch_input_dir = os.path.join(spotting_output_dir, map_name) 274 | output_geojson = os.path.join(stitch_output_dir, map_name + '.geojson') 275 | 276 | run_stitch_command = 'python stitch_output.py --input_dir '+stitch_input_dir + ' --output_geojson ' + output_geojson 277 | 278 | exe_ret = execute_command(run_stitch_command, if_print_command) 279 | 280 | if "sample_map_path" in locals(): 281 | if 'error' in exe_ret: 282 | 
error = exe_ret['error'] 283 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error } 284 | 285 | # elif 'time_usage' in exe_ret: 286 | # time_usage = exe_ret['time_usage'] 287 | # time_usage_dict[external_id]['stitch'] = time_usage 288 | # else: 289 | # raise NotImplementedError 290 | 291 | # time_img_geojson = time.time() 292 | 293 | 294 | # ------------------------- post-OCR ------------------------------ 295 | if "sample_map_path" in locals(): 296 | if module_post_ocr: 297 | os.chdir(os.path.join(map_kurator_system_dir, 'm4_post_ocr')) 298 | 299 | if not os.path.isdir(postocr_only_output_dir): 300 | os.makedirs(postocr_only_output_dir) 301 | 302 | for index, record in sample_map_df.iterrows(): 303 | 304 | external_id = record.external_id 305 | if external_id not in external_id_to_img_path_dict: 306 | error_reason_dict[external_id] = {'img_path': None, 'error': 'key not in external_id_to_img_path_dict'} 307 | continue 308 | 309 | img_path = external_id_to_img_path_dict[external_id] 310 | map_name = os.path.basename(img_path).split('.')[0] 311 | 312 | input_geojson_file = os.path.join(geocoord_output_dir, map_name + '.geojson') 313 | 314 | run_postocr_command = 'python post_ocr_main.py --in_geojson_file '+ input_geojson_file + ' --out_geojson_dir ' + os.path.join(map_kurator_system_dir, postocr_only_output_dir) 315 | 316 | exe_ret = execute_command(run_postocr_command, if_print_command) 317 | 318 | if 'error' in exe_ret: 319 | error = exe_ret['error'] 320 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error } 321 | 322 | # elif 'time_usage' in exe_ret: 323 | # time_usage = exe_ret['time_usage'] 324 | # time_usage_dict[external_id]['geocoord_geojson'] = time_usage 325 | # else: 326 | # raise NotImplementedError 327 | 328 | # time_geocoord_geojson = time.time() 329 | 330 | # ------------------------- Convert image coordinates to geocoordinates ------------------------------ 331 | if "sample_map_path" in locals(): 332 | if module_geocoord_geojson: 333 | os.chdir(os.path.join(map_kurator_system_dir, 'm5_geocoordinate_converter')) 334 | 335 | if not os.path.isdir(geocoord_output_dir): 336 | os.makedirs(geocoord_output_dir) 337 | 338 | for index, record in sample_map_df.iterrows(): 339 | external_id = record.external_id 340 | if external_id not in external_id_to_img_path_dict: 341 | error_reason_dict[external_id] = {'img_path': None, 342 | 'error': 'key not in external_id_to_img_path_dict'} 343 | continue 344 | 345 | img_path = external_id_to_img_path_dict[external_id] 346 | map_name = os.path.basename(img_path).split('.')[0] 347 | 348 | # current_files_list = glob.glob(os.path.join(map_kurator_system_dir, geocoord_output_dir, "*.geojson")) 349 | 350 | # saved_map_list = [] 351 | # for mapname in current_files_list: 352 | # only_map = mapname.split("/")[-1]#.strip().replace(".geojson", "") 353 | # saved_map_list.append(only_map) 354 | 355 | in_geojson = os.path.join(stitch_output_dir, map_name + '.geojson') 356 | 357 | # current_map = in_geojson.split("/")[-1] 358 | 359 | # if current_map not in saved_map_list: 360 | # print("running missing file",current_map) 361 | 362 | run_converter_command = 'python convert_geojson_to_geocoord.py --sample_map_path ' + os.path.join(map_kurator_system_dir, input_csv_path) + ' --in_geojson_file ' + in_geojson + ' --out_geojson_dir ' + os.path.join(map_kurator_system_dir, geocoord_output_dir) 363 | 364 | exe_ret = execute_command(run_converter_command, if_print_command) 365 | 366 | if 'error' in exe_ret: 367 | error = 
exe_ret['error'] 368 | error_reason_dict[external_id] = {'img_path': img_path, 'error': error} 369 | 370 | # elif 'time_usage' in exe_ret: 371 | # time_usage = exe_ret['time_usage'] 372 | # time_usage_dict[external_id]['geocoord_geojson'] = time_usage 373 | # else: 374 | # raise NotImplementedError 375 | 376 | # time_geocoord_geojson = time.time() 377 | 378 | 379 | # --------------------- Error logging -------------------------- 380 | print('\n') 381 | current_time = datetime.datetime.now().strftime("%Y_%m_%d-%I:%M:%S_%p") 382 | error_reason_df = pd.DataFrame.from_dict(error_reason_dict, orient='index') 383 | error_reason_log_path = os.path.join(output_folder, expt_name, 'error_reason_' + current_time +'.csv') 384 | error_reason_df.to_csv(error_reason_log_path, index_label='external_id') 385 | 386 | 387 | def func_file_to_fullpath_dict(file_path_list): 388 | 389 | file_fullpath_dict = dict() 390 | for file_path in file_path_list: 391 | file_fullpath_dict[os.path.basename(file_path).split('.')[0]] = file_path 392 | 393 | return file_fullpath_dict 394 | 395 | def get_img_path_from_external_id(jp2_root_dir = '/data/rumsey-jp2/', sid_root_dir = '/data2/rumsey_sid_to_jpg/', additional_root_dir='/data2/rumsey-luna-img/', sample_map_path = None,external_id_key = 'external_id') : 396 | # returns (1) a dict with external-id as key, full image path as value (2) list of external-id that can not find image path 397 | 398 | jp2_file_path_list = glob.glob(os.path.join(jp2_root_dir, '*/*.jp2')) 399 | sid_file_path_list = glob.glob(os.path.join(sid_root_dir, '*.jpg')) 400 | add_file_path_list = glob.glob(os.path.join(additional_root_dir, '*')) 401 | 402 | jp2_file_fullpath_dict = func_file_to_fullpath_dict(jp2_file_path_list) 403 | sid_file_fullpath_dict = func_file_to_fullpath_dict(sid_file_path_list) 404 | add_file_fullpath_dict = func_file_to_fullpath_dict(add_file_path_list) 405 | 406 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str}) 407 | 408 | external_id_to_img_path_dict = {} 409 | 410 | unmatched_external_id_list = [] 411 | 412 | for index, record in sample_map_df.iterrows(): 413 | external_id = record.external_id 414 | filename_without_extension = external_id.strip("'").replace('.','') 415 | 416 | full_path = '' 417 | if filename_without_extension in jp2_file_fullpath_dict: 418 | full_path = jp2_file_fullpath_dict[filename_without_extension] 419 | elif filename_without_extension in sid_file_fullpath_dict: 420 | full_path = sid_file_fullpath_dict[filename_without_extension] 421 | elif filename_without_extension in add_file_fullpath_dict: 422 | full_path = add_file_fullpath_dict[filename_without_extension] 423 | else: 424 | # print('image with external_id not found in image_dir:', external_id) 425 | unmatched_external_id_list.append(external_id) 426 | continue 427 | assert (len(full_path)!=0) 428 | 429 | external_id_to_img_path_dict[external_id] = full_path 430 | 431 | return external_id_to_img_path_dict, unmatched_external_id_list 432 | 433 | def get_img_path_from_external_id_and_image_no(jp2_root_dir = '/data/rumsey-jp2/', sid_root_dir = '/data2/rumsey_sid_to_jpg/', additional_root_dir='/data2/rumsey-luna-img/', sample_map_path = None,external_id_key = 'external_id') : 434 | # returns (1) a dict with external-id as key, full image path as value (2) list of external-id that can not find image path 435 | 436 | jp2_file_path_list = glob.glob(os.path.join(jp2_root_dir, '*/*.jp2')) 437 | sid_file_path_list = glob.glob(os.path.join(sid_root_dir, '*.jpg')) # use converted jpg 
directly 438 | add_file_path_list = glob.glob(os.path.join(additional_root_dir, '*')) 439 | 440 | jp2_file_fullpath_dict = func_file_to_fullpath_dict(jp2_file_path_list) 441 | sid_file_fullpath_dict = func_file_to_fullpath_dict(sid_file_path_list) 442 | add_file_fullpath_dict = func_file_to_fullpath_dict(add_file_path_list) 443 | 444 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str}) 445 | 446 | external_id_to_img_path_dict = {} 447 | 448 | unmatched_external_id_list = [] 449 | for index, record in sample_map_df.iterrows(): 450 | external_id = record.external_id 451 | image_no = record.image_no 452 | # filename_without_extension = external_id.strip("'").replace('.','') 453 | filename_without_extension = image_no.strip("'").split('.')[0] 454 | 455 | full_path = '' 456 | if filename_without_extension in jp2_file_fullpath_dict: 457 | full_path = jp2_file_fullpath_dict[filename_without_extension] 458 | elif filename_without_extension in sid_file_fullpath_dict: 459 | full_path = sid_file_fullpath_dict[filename_without_extension] 460 | elif filename_without_extension in add_file_fullpath_dict: 461 | full_path = add_file_fullpath_dict[filename_without_extension] 462 | else: 463 | print('image with external_id not found in image_dir:', external_id) 464 | unmatched_external_id_list.append(external_id) 465 | continue 466 | assert (len(full_path)!=0) 467 | 468 | external_id_to_img_path_dict[external_id] = full_path 469 | 470 | return external_id_to_img_path_dict, unmatched_external_id_list 471 | 472 | 473 | if __name__ == '__main__': 474 | 475 | parser = argparse.ArgumentParser() 476 | parser.add_argument('--jp2_root_dir', type=str, default='/data/rumsey-jp2/', 477 | help='image dir of jp2 files.') 478 | parser.add_argument('--sid_root_dir', type=str, default='/data2/rumsey_sid_to_jpg/', 479 | help='image dir of sid files.') 480 | parser.add_argument('--additional_root_dir', type=str, default='/data2/rumsey-luna-img/', 481 | help='image dir of additional luna files.') 482 | parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv', 483 | help='path to sample map csv, which contains gcps info') 484 | parser.add_argument('--external_id_key', type=str, default='external_id', 485 | help='key string for external id, could be external_id or ListNo') 486 | 487 | args = parser.parse_args() 488 | print(args) 489 | 490 | # get_img_path_from_external_id(jp2_root_dir = args.jp2_root_dir, sid_root_dir = args.sid_root_dir, additional_root_dir = args.additional_root_dir, 491 | # sample_map_path = args.sample_map_path,external_id_key = args.external_id_key) 492 | 493 | get_img_path_from_external_id_and_image_no(jp2_root_dir = args.jp2_root_dir, sid_root_dir = args.sid_root_dir, additional_root_dir = args.additional_root_dir, 494 | sample_map_path = args.sample_map_path,external_id_key = args.external_id_key) 495 | --------------------------------------------------------------------------------