├── .gitignore
├── README.md
├── external_id_search
│   └── script.py
├── m0_preprocessing
│   └── convert_sid_to_jpg.py
├── m1_geotiff
│   └── convert_image_to_geotiff.py
├── m2_detection_recognition
│   └── crop_img.py
├── m3_image_geojson
│   ├── run.sh
│   └── stitch_output.py
├── m4_post_ocr
│   ├── logstash_postocr.conf
│   ├── post_ocr.py
│   ├── post_ocr_main.py
│   └── preprocess.py
├── m5_geocoordinate_converter
│   └── convert_geojson_to_geocoord.py
├── m6_entity_linker
│   ├── create_elasticsearch_index.py
│   ├── create_spatial_index_postgres.py
│   ├── entity_linking.py
│   ├── logstash_osm_linker.conf
│   ├── logstash_postgres_world.conf
│   ├── postgres_logger.py
│   ├── upload_osm_to_postgres_all_continents.py
│   └── upload_osm_to_postgres_ogr2ogr.py
├── m_sanborn
│   ├── s1_geocoding.py
│   ├── s2_clustering.py
│   └── s3_gen_geojson.py
├── requirements.txt
├── run.py
├── run_img.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | data0/
3 | data1/
4 | rumsey_output/
5 | .idea/
6 | .env
7 | MrSID*
8 | __pycache__
9 | debug/
10 | .ipynb_checkpoints/
11 | run_linker.py
12 | osm_linker.csv
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## About mapKurator System
3 |
4 | **[New]**: Our documentation website for the mapKurator system is up! [https://knowledge-computing.github.io/mapkurator-doc/](https://knowledge-computing.github.io/mapkurator-doc/#/)
5 |
6 | [mapKurator](https://dl.acm.org/doi/abs/10.1145/3589132.3625579) is a fully automatic pipeline developed by the [**Knowledge Computing Lab**](https://knowledge-computing.github.io/) at the **University of Minnesota** to process a large number of scanned historical map images. Outputs include the recognized text labels, label bounding polygons, labels after post-OCR correction, and a geo-entity identifier from OpenStreetMap.
7 |
8 | ### mapKurator textspotter repository
9 | Please refer to the following repositories for the text-spotting models used in mapKurator: [Spotter-v2](https://github.com/knowledge-computing/mapkurator-spotter), [PALETTE](https://github.com/knowledge-computing/mapkurator-palette)
10 |
11 | ---------
12 |
13 | ## Data Card - Derived Dataset Processed by mapKurator System
14 |
15 | Map text recognized from the [Rumsey historical map collection](https://www.davidrumsey.com/), which contains 57K georeferenced maps.
16 |
17 | ### Dataset Download Link
18 |
19 | Text extraction and recognition results: [https://s3.msi.umn.edu/rumsey_output/Round3/english.zip](https://s3.msi.umn.edu/rumsey_output/Round3/english.zip) (~50GB)
20 |
21 | ### Dataset Languages
22 |
23 | English
24 |
25 | ### Language Creators
26 |
27 | Machine-generated
28 |
29 | ## Dataset Structure
30 |
31 | ### Data Fields
32 |
33 | Each GeoJSON feature carries a `geometry` (the text-label bounding polygon) and the following `properties` produced by the pipeline modules in this repository: `text` (recognized label), `score` (recognition confidence), `postocr_label` (label after post-OCR correction), `img_coordinates` (the polygon in image pixel coordinates), and `osm_id` (linked OpenStreetMap identifiers).
34 |
35 | ### Output File Name
36 |
37 | Each output GeoJSON file is named after the external ID of the original map image.
38 |
39 |
40 |
41 | ### Citation
42 | ```
43 | @inproceedings{kim2023mapkurator,
44 | title={The mapKurator System: A Complete Pipeline for Extracting and Linking Text from Historical Maps},
45 | author={Kim, Jina and Li, Zekun and Lin, Yijun and Namgung, Min and Jang, Leeje and Chiang, Yao-Yi},
46 | booktitle={Proceedings of the 31st ACM International Conference on Advances in Geographic Information Systems},
47 | pages={1--4},
48 | year={2023}
49 | }
50 | ```
51 |
52 | ### Licensing Information
53 |
54 | CC BY-NC 2.0
55 |
56 |
--------------------------------------------------------------------------------
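
A minimal sketch of reading one output file with the standard `json` module; the property names follow the pipeline code in this repository (`text`/`score` from m2-m3, `postocr_label` from m4, `img_coordinates` from m5, `osm_id` from m6), and the file name is a placeholder.

```python
import json

# Placeholder file name; outputs are named <external_id>.geojson
with open("10095000.geojson", encoding="utf8") as f:
    data = json.load(f)

for feature in data["features"]:
    props = feature["properties"]
    print(props["text"], props["score"],
          props.get("postocr_label"), props.get("osm_id"))
```
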
/external_id_search/script.py:
--------------------------------------------------------------------------------
1 | from elasticsearch_dsl import Search, Q
2 | from elasticsearch import Elasticsearch, helpers
3 | from elasticsearch import RequestsHttpConnection
4 | import argparse
5 | import os
6 | import glob
7 | import json
8 | import nltk
9 | import logging
10 | from dotenv import load_dotenv
11 |
12 | import pandas as pd
13 | import numpy as np
14 |
15 | import re
16 | import warnings
17 | warnings.filterwarnings("ignore")
18 |
19 |
20 |
21 | def db_connect():
22 | """Elasticsearch Connection on Sansa"""
23 | load_dotenv()
24 |
25 | DB_HOST = os.getenv("DB_HOST")
26 | USER_NAME = os.getenv("DB_USERNAME")
27 | PASSWORD = os.getenv("DB_PASSWORD")
28 |
29 | es = Elasticsearch([DB_HOST], connection_class=RequestsHttpConnection, http_auth=(USER_NAME, PASSWORD), verify_certs=False)
30 | return es
31 |
32 |
33 | def query(target):
34 | es = db_connect()
35 | inputs = target.upper()
36 | query = {"query": {"match": {"text": f"{inputs}"}}}
37 | test = es.search(index="meta", body=query, size=10000)["hits"]["hits"]
38 |
39 | id_list = []
40 | if len(test) != 0 :
41 | for i in range(len(test)):
42 | map_id = test[i]['_source']['external_id']
43 | id_list.append(map_id)
44 |
45 |
46 | result = sorted(list(set(id_list)))
47 | return result
48 |
49 |
50 | def main(args):
51 | keyword = args.target
52 | metadata_path = args.metadata
53 | meta_df = pd.read_csv(metadata_path)
54 | meta_df['tmp'] = meta_df['image_no'].str.split(".").str[0]
55 |
56 | results = query(keyword)
57 | # print(f' "{keyword}" exist in: {results}')
58 |
59 | tmp_df = meta_df[meta_df.tmp.isin(results)]
60 |
61 | print(f'"{keyword}" exist in:')
62 | for index, row in tmp_df.iterrows():
63 | print(f'{row.tmp} \t {row.title}')
64 |
65 |
66 | if __name__ == '__main__':
67 | parser = argparse.ArgumentParser()
68 | parser.add_argument('--target', type=str, default='east', help='')
69 | parser.add_argument('--metadata', type=str, default='/home/maplord/maplist_csv/luna_omo_metadata_56628_20220724.csv', help='')
70 |
71 | args = parser.parse_args()
72 | print(args)
73 |
74 | main(args)
75 |
--------------------------------------------------------------------------------
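
The script reads its Elasticsearch credentials from a `.env` file via `python-dotenv` and issues a full-text `match` query against the `meta` index. A minimal standalone sketch of the same lookup; the host and credentials are placeholders.

```python
# .env (placeholder values):
#   DB_HOST=https://localhost:9200
#   DB_USERNAME=elastic
#   DB_PASSWORD=changeme

from elasticsearch import Elasticsearch

es = Elasticsearch(["https://localhost:9200"],
                   http_auth=("elastic", "changeme"), verify_certs=False)

# Same query shape as query() above: full-text match on the 'text' field
body = {"query": {"match": {"text": "EAST"}}}
hits = es.search(index="meta", body=body, size=10000)["hits"]["hits"]
print(sorted({hit["_source"]["external_id"] for hit in hits}))
```
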
/m0_preprocessing/convert_sid_to_jpg.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import time
4 | import multiprocessing
5 |
6 | sid_dir = '/data/rumsey-sid'
7 | sid_to_jpg_dir = '/data2/rumsey_sid_to_jpg/'
8 | num_process = 20
9 | if_print_command = True
10 |
11 | sid_list = glob.glob(os.path.join(sid_dir, '*/*.sid'))
12 |
13 | def execute_command(command, if_print_command):
14 | t1 = time.time()
15 |
16 | if if_print_command:
17 | print(command)
18 | os.system(command)
19 |
20 | t2 = time.time()
21 | time_usage = t2 - t1
22 | return time_usage
23 |
24 |
25 | def conversion(img_path):
26 | mrsiddecode_executable="/home/zekun/dr_maps/mapkurator-system/m1_geotiff/MrSID_DSDK-9.5.4.4709-rhel6.x86-64.gcc531/Raster_DSDK/bin/mrsiddecode"
27 | map_name = os.path.basename(img_path)[:-4]
28 |
29 | redirected_path = os.path.join(sid_to_jpg_dir, map_name + '.jpg')
30 |
31 | run_sid_to_jpg_command = mrsiddecode_executable + ' -quiet -i '+ img_path + ' -o '+redirected_path
32 | time_usage = execute_command(run_sid_to_jpg_command, if_print_command)
33 |
34 |
35 |
36 | if __name__ == "__main__":
37 | pool = multiprocessing.Pool(num_process)
38 | start_time = time.perf_counter()
39 | processes = [pool.apply_async(conversion, args=(sid_path,)) for sid_path in sid_list]
40 | result = [p.get() for p in processes]
41 | finish_time = time.perf_counter()
42 | print(f"Program finished in {finish_time-start_time} seconds")
43 |
44 |
--------------------------------------------------------------------------------
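
If a long batch run is interrupted, re-running the script reconverts every file. A small optional sketch (not part of the original script) that reuses `conversion()` and `sid_to_jpg_dir` from above and skips maps whose JPEG output already exists:

```python
import os

def conversion_resumable(img_path):
    """Like conversion(), but skip .sid files already converted to .jpg."""
    map_name = os.path.basename(img_path)[:-4]
    redirected_path = os.path.join(sid_to_jpg_dir, map_name + '.jpg')
    if os.path.exists(redirected_path):
        return  # output already present; nothing to do
    conversion(img_path)
```
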
/m1_geotiff/convert_image_to_geotiff.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import pandas as pd
4 | import ast
5 | import argparse
6 | import logging
7 | import pdb
8 |
9 | logging.basicConfig(level=logging.INFO)
10 |
11 | def func_file_to_fullpath_dict(file_path_list):
12 |
13 | file_fullpath_dict = dict()
14 | for file_path in file_path_list:
15 | file_fullpath_dict[os.path.basename(file_path).split('.')[0]] = file_path
16 |
17 | return file_fullpath_dict
18 |
19 | def main(args):
20 |
21 | jp2_root_dir = args.jp2_root_dir
22 | sid_root_dir = args.sid_root_dir
23 | additional_root_dir = args.additional_root_dir
24 | out_geotiff_dir = args.out_geotiff_dir
25 |
26 | sample_map_path = args.sample_map_path
27 | external_id_key = args.external_id_key
28 |
29 | jp2_file_path_list = glob.glob(os.path.join(jp2_root_dir, '*/*.jp2'))
30 | sid_file_path_list = glob.glob(os.path.join(sid_root_dir, '*.jpg')) # use converted jpg directly
31 | add_file_path_list = glob.glob(os.path.join(additional_root_dir, '*'))
32 |
33 | jp2_file_fullpath_dict = func_file_to_fullpath_dict(jp2_file_path_list)
34 | sid_file_fullpath_dict = func_file_to_fullpath_dict(sid_file_path_list)
35 | add_file_fullpath_dict = func_file_to_fullpath_dict(add_file_path_list)
36 |
37 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str})
38 |
39 |
40 | for index, record in sample_map_df.iterrows():
41 | external_id = record.external_id
42 | transform_method = record.transformation_method
43 | gcps = record.gcps
44 | filename_without_extension = external_id.strip("'").replace('.','')
45 |
46 | full_path = ''
47 | if filename_without_extension in jp2_file_fullpath_dict:
48 | full_path = jp2_file_fullpath_dict[filename_without_extension]
49 | elif filename_without_extension in sid_file_fullpath_dict:
50 | full_path = sid_file_fullpath_dict[filename_without_extension]
51 | elif filename_without_extension in add_file_fullpath_dict:
52 | full_path = add_file_fullpath_dict[filename_without_extension]
53 | else:
54 | print('image with external_id not found in image_dir:', external_id)
55 | continue
56 | assert (len(full_path)!=0)
57 |
58 | gcps = ast.literal_eval(gcps)
59 |
60 | gcp_str = ''
61 | for gcp in gcps:
62 | lng, lat = gcp['location']
63 | x, y = gcp['pixel']
64 | gcp_str += '-gcp '+str(x) + ' ' + str(y) + ' ' + str(lng) + ' ' + str(lat) + ' '
65 |
66 | # gdal_translate to add GCP to raw image
67 | gdal_command = 'gdal_translate -of Gtiff '+gcp_str + full_path + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff'
68 | print(gdal_command)
69 | os.system(gdal_command)
70 |
71 |
72 | assert transform_method in ['affine','polynomial','tps']
73 |
74 | # reprojection with gdal_warp
75 | if transform_method == 'affine':
76 | # first order
77 |
78 | warp_command = 'gdalwarp -s_srs EPSG:4326 -t_srs EPSG:3857 -r near -order 1 -of GTiff ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff' + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '.geotiff'
79 |
80 | elif transform_method == 'polynomial':
81 | # second order
82 | warp_command = 'gdalwarp -s_srs EPSG:4326 -t_srs EPSG:3857 -r near -order 2 -of GTiff '+ os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff' + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '.geotiff'
83 |
84 | elif transform_method == 'tps':
85 | # Thin plate spline #debug/11558008.geotiff #10057000.geotiff
86 | warp_command = 'gdalwarp -s_srs EPSG:4326 -t_srs EPSG:3857 -r near -tps -of GTiff '+ os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff' + ' ' + os.path.join(out_geotiff_dir, filename_without_extension) + '.geotiff'
87 |
88 | else:
89 | raise NotImplementedError
90 | print(warp_command)
91 | os.system(warp_command)
92 | # remove temporary tiff file
93 | # os.system('rm ' + os.path.join(out_geotiff_dir, filename_without_extension) + '_temp.geotiff')
94 |
95 |
96 | logging.info('Done generating geotiff for %s', external_id)
97 |
98 |
99 | if __name__ == '__main__':
100 |
101 | parser = argparse.ArgumentParser()
102 | parser.add_argument('--jp2_root_dir', type=str, default='/data/rumsey-jp2/',
103 | help='image dir of jp2 files.')
104 | parser.add_argument('--sid_root_dir', type=str, default='/data2/rumsey_sid_to_jpg/',
105 | help='image dir of sid files.')
106 | parser.add_argument('--additional_root_dir', type=str, default='/data2/rumsey-luna-img/',
107 | help='image dir of additional luna files.')
108 | parser.add_argument('--out_geotiff_dir', type=str, default='data/geotiff/',
109 | help='output dir for geotiff')
110 | parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
111 | help='path to sample map csv, which contains gcps info')
112 | parser.add_argument('--external_id_key', type=str, default='external_id',
113 | help='key string for external id, could be external_id or ListNo')
114 |
115 | args = parser.parse_args()
116 | print(args)
117 |
118 |
119 | main(args)
120 |
--------------------------------------------------------------------------------
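
Each GCP record pairs a pixel position with a geographic location, and gdal_translate expects them as `-gcp <pixel-x> <pixel-y> <lng> <lat>`. A worked example of the string built in the loop above, using one hypothetical control point:

```python
# One hypothetical GCP record, in the format stored in the sample-map CSV
gcp = {'location': [-93.265, 44.977], 'pixel': [1024, 2048]}

lng, lat = gcp['location']
x, y = gcp['pixel']
gcp_str = '-gcp ' + str(x) + ' ' + str(y) + ' ' + str(lng) + ' ' + str(lat) + ' '
print(gcp_str)  # -> "-gcp 1024 2048 -93.265 44.977 "
```
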
/m2_detection_recognition/crop_img.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from PIL import Image, ImageFile
4 | import numpy as np
5 | import argparse
6 | import logging
7 |
8 | logging.basicConfig(level=logging.INFO)
9 | Image.MAX_IMAGE_PIXELS=None # allow reading huge images
10 |
11 | # tolerate truncated image files when loading (requires the ImageFile import above)
12 | ImageFile.LOAD_TRUNCATED_IMAGES = True
13 |
14 | def main(args):
15 |
16 | img_path = args.img_path
17 | output_dir = args.output_dir
18 |
19 | map_name = os.path.basename(img_path).split('.')[0] # get the map name without extension
20 | output_dir = os.path.join(output_dir, map_name)
21 |
22 | if not os.path.isdir(output_dir):
23 | os.makedirs(output_dir)
24 |
25 | map_img = Image.open(img_path)
26 | width, height = map_img.size
27 |
28 | #print(width, height)
29 |
30 | shift_size = 1000
31 |
32 | # pad the image to the size divisible by shift-size
33 | num_tiles_w = int(np.ceil(1. * width / shift_size))
34 | num_tiles_h = int(np.ceil(1. * height / shift_size))
35 | enlarged_width = int(shift_size * num_tiles_w)
36 | enlarged_height = int(shift_size * num_tiles_h)
37 |
38 | enlarged_map = Image.new(mode="RGB", size=(enlarged_width, enlarged_height))
39 | # paste map_img onto the enlarged canvas
40 | enlarged_map.paste(map_img)
41 |
42 | for idx in range(0, num_tiles_h):
43 | for jdx in range(0, num_tiles_w):
44 | img_clip = enlarged_map.crop((jdx * shift_size, idx * shift_size,(jdx + 1) * shift_size, (idx + 1) * shift_size, ))
45 |
46 | out_path = os.path.join(output_dir, 'h' + str(idx) + '_w' + str(jdx) + '.jpg')
47 | img_clip.save(out_path)
48 |
49 | logging.info('Done cropping %s' %img_path )
50 |
51 |
52 | if __name__ == '__main__':
53 |
54 | parser = argparse.ArgumentParser()
55 | parser.add_argument('--img_path', type=str, default='../data/100_maps/8628000.jp2',
56 | help='path to image file.')
57 | parser.add_argument('--output_dir', type=str, default='../data/100_maps_crop/',
58 | help='path to output dir')
59 |
60 | args = parser.parse_args()
61 | print(args)
62 |
63 |
64 | # if not os.path.isdir(args.output_dir):
65 | # os.makedirs(args.output_dir)
66 | # print('created dir',args.output_dir)
67 |
68 | main(args)
69 |
--------------------------------------------------------------------------------
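
The tiles are named `h<row>_w<col>.jpg`, and stitch_output.py (m3) later recovers each tile's pixel offset from that name. A small sketch of the mapping in both directions, assuming the 1000-pixel shift size used above:

```python
import os

SHIFT_SIZE = 1000  # must match shift_size in crop_img.py

def tile_name(row_idx: int, col_idx: int) -> str:
    """Tile filename for the patch at (row_idx, col_idx)."""
    return f'h{row_idx}_w{col_idx}.jpg'

def tile_offset(filename: str) -> tuple:
    """Pixel offset (x, y) of a tile, recovered from its filename."""
    h_part, w_part = os.path.basename(filename).split('.')[0].split('_')
    return int(w_part[1:]) * SHIFT_SIZE, int(h_part[1:]) * SHIFT_SIZE

assert tile_name(2, 3) == 'h2_w3.jpg'
assert tile_offset('h2_w3.jpg') == (3000, 2000)
```
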
/m3_image_geojson/run.sh:
--------------------------------------------------------------------------------
1 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/13415000' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/13415000.geojson'
2 |
3 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/0845008' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/0845008.geojson'
4 |
5 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/8407000' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/8407000.geojson'
6 |
7 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/13272452' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/13272452.geojson'
8 |
9 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/6855023' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/6855023.geojson'
10 |
11 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/10198088' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/10198088.geojson'
12 |
13 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/2119002' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/2119002.geojson'
14 |
15 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/5850099' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/5850099.geojson'
16 |
17 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/0352067' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/0352067.geojson'
18 |
19 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/8496000' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/8496000.geojson'
20 |
21 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/10285112' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/10285112.geojson'
22 |
23 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/11201250' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/11201250.geojson'
24 |
25 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/7924008' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/7924008.geojson'
26 |
27 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/8859002' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/8859002.geojson'
28 |
29 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/2239006' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/2239006.geojson'
30 |
31 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/6954000' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/6954000.geojson'
32 |
33 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/9085004' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/9085004.geojson'
34 |
35 | # python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/6353076' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/6353076.geojson'
36 |
37 |
38 | python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/english/0019149' --output='/data2/rumsey_output/57k_maps_r3/stitch/english/0019149.geojson'
39 | python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/russian/0019149' --output='/data2/rumsey_output/57k_maps_r3/stitch/russian/0019149.geojson'
40 | python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/arabic/0019149' --output='/data2/rumsey_output/57k_maps_r3/stitch/arabic/0019149.geojson'
41 | python3 stitch_output.py --input_dir='/data2/rumsey_output/57k_maps_r3/spotter/chinese/0019149' --output='/data2/rumsey_output/57k_maps_r3/stitch/chinese/0019149.geojson'
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
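
The four active lines differ only in the language subdirectory. A sketch of the same batch as a Python loop; note that run.sh passes `--output`, which argparse resolves to stitch_output.py's `--output_geojson` by prefix matching, so the full flag name is used here.

```python
import subprocess

MAP_ID = '0019149'
BASE = '/data2/rumsey_output/57k_maps_r3'

for lang in ['english', 'russian', 'arabic', 'chinese']:
    subprocess.run([
        'python3', 'stitch_output.py',
        f'--input_dir={BASE}/spotter/{lang}/{MAP_ID}',
        f'--output_geojson={BASE}/stitch/{lang}/{MAP_ID}.geojson',
    ], check=True)
```
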
/m3_image_geojson/stitch_output.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import pandas as pd
4 | import numpy as np
5 | import argparse
6 | from geojson import Polygon, Feature, FeatureCollection, dump
7 | import logging
8 | import pdb
9 |
10 | # logging.basicConfig(level=logging.INFO)
11 |
12 | logging.basicConfig(level=logging.ERROR)
13 | pd.options.mode.chained_assignment = None
14 |
15 | def concatenate_and_convert_to_geojson(args):
16 | map_subdir = args.input_dir
17 | output_geojson = args.output_geojson
18 | shift_size = args.shift_size
19 | eval_bool = args.eval_only
20 |
21 | file_list = glob.glob(map_subdir + '/*.json')
22 | file_list = sorted(file_list)
23 | if len(file_list) == 0:
24 | logging.warning('No files found for %s' % map_subdir)
25 |
26 | map_data = []
27 | for file_path in file_list:
28 | patch_index_h, patch_index_w = os.path.basename(file_path).split('.')[0].split('_')
29 | patch_index_h = int(patch_index_h[1:])
30 | patch_index_w = int(patch_index_w[1:])
31 |
32 | try:
33 | # force the text column to dtype 'object' so values keep their raw string form (e.g. '6' is not parsed as 6.0, 'NAn' not as NaN)
34 | df = pd.read_json(file_path, dtype={'text':object})
35 | except pd.errors.EmptyDataError:
36 | logging.warning('%s is empty. Skipping.' % file_path)
37 | continue
38 | except KeyError as ke:
39 | logging.warning('%s has no detected labels. Skipping.' %file_path)
40 | continue
41 |
42 | for index, line_data in df.iterrows():
43 | df['polygon_x'][index] = np.array(df['polygon_x'][index]).astype(float) + shift_size * patch_index_w
44 | df['polygon_y'][index] = np.array(df['polygon_y'][index]).astype(float) + shift_size * patch_index_h
45 | map_data.append(df)
46 |
47 | if len(map_data) == 0:
48 | with open(output_geojson, 'w', encoding='utf8') as f:
49 | pass
50 | print('created empty geojson for', output_geojson)
51 | return 0
52 |
53 | map_df = pd.concat(map_data)
54 |
55 |
56 | features = []
57 | for index, line_data in map_df.iterrows():
58 | polygon_x, polygon_y = list(line_data['polygon_x']), list(line_data['polygon_y'])
59 |
60 | if eval_bool == False:
61 | # Pixel y grows downward, so y is negated for QGIS visualization.
62 | # Flip each vertex with [x, -y],
63 | # and repeat the first vertex to close the polygon; otherwise QGIS cannot display it.
64 | polygon = Polygon([[[x,-y] for x,y in zip(polygon_x, polygon_y)]+[[polygon_x[0], -polygon_y[0]]]])
65 | else:
66 | polygon = Polygon([[[x,y] for x,y in zip(polygon_x, polygon_y)]+[[polygon_x[0], polygon_y[0]]]])
67 |
68 | text = line_data['text']
69 | score = line_data['score']
70 | features.append(Feature(geometry = polygon, properties={"text": text, "score": score} ))
71 |
72 | feature_collection = FeatureCollection(features)
73 | # with open(os.path.join(output_dir, map_subdir +'.geojson'), 'w') as f:
74 | # dump(feature_collection, f)
75 | with open(output_geojson, 'w', encoding='utf8') as f:
76 | dump(feature_collection, f, ensure_ascii=False)
77 |
78 | logging.info('Done generating geojson (img coord) for %s', map_subdir)
79 |
80 |
81 | if __name__ == '__main__':
82 |
83 | parser = argparse.ArgumentParser()
84 | parser.add_argument('--input_dir', type=str, default='data/100_maps_crop_abc/0063014',
85 | help='path to input json path.')
86 |
87 | parser.add_argument('--output_geojson', type=str, default='data/100_maps_geojson_abc/0063014.geojson',
88 | help='path to output geojson path')
89 |
90 | parser.add_argument('--shift_size', type=int, default = 1000,
91 | help='image patch size and shift size.')
92 |
93 | # This must not be a string-typed option: any non-empty string is truthy and would read as True.
94 | parser.add_argument('--eval_only', default = False, action='store_true',
95 | help='keep positive coordinate')
96 |
97 | args = parser.parse_args()
98 | print(args)
99 |
100 | concatenate_and_convert_to_geojson(args)
101 |
102 |
103 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
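
For reference, a minimal sketch of the per-patch JSON this script consumes: one record per detected label, with `polygon_x`/`polygon_y` in patch-local pixel coordinates (the concrete values below are made up):

```python
# Hypothetical contents of h2_w3.json for a single detected label
patch_records = [{
    "polygon_x": [12.0, 118.0, 118.0, 12.0],
    "polygon_y": [40.0, 40.0, 77.0, 77.0],
    "text": "MAIN ST",
    "score": 0.94,
}]
# stitch_output.py shifts each vertex by (w_idx * shift_size, h_idx * shift_size)
# and, unless --eval_only is set, negates y so QGIS renders the polygons upright.
```
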
/m4_post_ocr/logstash_postocr.conf:
--------------------------------------------------------------------------------
1 | input {
2 | file {
3 | path => "total.csv"
4 | start_position => beginning
5 | sincedb_path => "/dev/null"
6 | }
7 | }
8 | filter {
9 | csv {
10 | separator => ","
11 | columns => [
12 | "name",
13 | "count"
14 | ]
15 | }
16 | }
17 | output {
18 | elasticsearch {
19 | hosts => "localhost:9200"
20 | index => "osm-voca"
21 | doc_as_upsert => true
22 | }
23 | }
--------------------------------------------------------------------------------
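
Logstash's file input generally requires an absolute `path`, so `"total.csv"` will likely need to be expanded before this pipeline runs. As an alternative, a minimal sketch that indexes the same `name,count` rows into `osm-voca` with the Python client instead of Logstash; it also reproduces the raw-line `message` field that post_ocr.py reads.

```python
import csv

import elasticsearch
from elasticsearch import helpers

es = elasticsearch.Elasticsearch([{'host': 'localhost', 'port': 9200}])

def actions(path):
    with open(path, newline='') as f:
        for name, count in csv.reader(f):
            yield {'_index': 'osm-voca', 'name': name, 'count': count,
                   'message': f'{name},{count}'}  # raw line, as Logstash stores it

helpers.bulk(es, actions('total.csv'))
```
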
/m4_post_ocr/post_ocr.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | import json
4 | import http.client as http_client
5 | import nltk
6 | import re
7 |
8 | import elasticsearch
9 | import elasticsearch.helpers
10 |
11 |
12 | def lexical_search_query(target_text, es):
13 | """ Query candidates and save them as 'postocr_label' """
14 |
15 | clean_txt = []
16 | if type(target_text) == str:
17 | if any(char.isdigit() for char in target_text) == False:
18 | for t in range(len(target_text)):
19 | txt = target_text[t]
20 | if txt.isalpha():
21 | clean_txt.append(txt)
22 |
23 | temp_label = ''.join([str(item) for item in clean_txt])
24 | if len(temp_label) != 0:
25 | target_text = temp_label
26 |
27 | process = re.findall('[A-Z][^A-Z]*', target_text)
28 | if all(c.isupper() for c in process) or len(process) == 1:
29 |
30 | if type(target_text) == str and any(c.isalpha() for c in target_text):
31 | # edist 0
32 | fuzziness = 0
33 | inputs = target_text.lower()
34 | q1 = {'query': {'fuzzy': {'name': {'value': inputs, 'fuzziness': 0}}}}
35 | try:
36 | es_results = elasticsearch.helpers.scan(es, index="osm-voca", preserve_order=True, query=q1)
37 | except elasticsearch.ElasticsearchException as es_error:
38 | print(es_error)
39 |
40 | test = [item['_source'] for item in es_results if item["_source"]['name'] is not None]
41 |
42 |
43 | edist = []
44 | edist_update = []
45 |
46 | edd_min_find = 0
47 | min_candidates = False
48 |
49 | if test != 'NaN':
50 | for tt in range(len(test)):
51 | if 'name' in test[tt]:
52 | candidate = test[tt]['name']
53 | edist.append(candidate)
54 |
55 | for e in range(len(edist)):
56 | edd = nltk.edit_distance(inputs.upper(), edist[e].upper())
57 |
58 | if edd == 0:
59 | edist_update.append(edist[e])
60 | min_candidates = edist[e]
61 | edd_min_find = 1
62 |
63 | # edd 1
64 | if edd_min_find != 1:
65 | # edist 1
66 | fuzziness = 1
67 |
68 | q2 = {'query': {'fuzzy': {'name': {'value': inputs, 'fuzziness': fuzziness}}}}
69 | try:
70 | es_results = elasticsearch.helpers.scan(es, index="osm-voca", preserve_order=True, query=q2)
71 | except elasticsearch.ElasticsearchException as es_error:
72 | print(es_error)
73 |
74 | test = [item['_source'] for item in es_results if item["_source"]['name'] is not None]
75 |
76 |
77 | edist = []
78 | edist_count = []
79 | edist_update = []
80 | edist_count_update = []
81 |
82 | if test != 'NaN':
83 | for tt in range(len(test)):
84 | if 'name' in test[tt]:
85 | candidate = test[tt]['message']
86 | cand = candidate.split(',')[0]
87 | count = candidate.split(',')[1]
88 | edist.append(cand)
89 | edist_count.append(count)
90 |
91 | for e in range(len(edist)):
92 | edd = nltk.edit_distance(inputs.upper(), edist[e].upper())
93 |
94 | if edd == 1:
95 | edist_update.append(edist[e])
96 | edist_count_update.append(edist_count[e])
97 |
98 | if len(edist_update) != 0:
99 | index = edist_count_update.index(max(edist_count_update, key=int))  # compare counts numerically, not lexicographically
100 | min_candidates = edist_update[index]
101 | edd_min_find = 1
102 |
103 | # edd 2
104 | if edd_min_find != 1:
105 | # edist 2
106 | fuzziness = 2
107 | q3 = {'query': {'fuzzy': {'name': {'value': inputs, 'fuzziness': fuzziness}}}}
108 | try:
109 | es_results = elasticsearch.helpers.scan(es, index="osm-voca", preserve_order=True, query=q3)
110 | except elasticsearch.ElasticsearchException as es_error:
111 | print(es_error)
112 |
113 | test = [item['_source'] for item in es_results if item["_source"]['name'] is not None]
114 |
115 | edist = []
116 | edist_count = []
117 | edist_update = []
118 | edist_count_update = []
119 |
120 | if test != 'NaN':
121 | for tt in range(len(test)):
122 | if 'name' in test[tt]:
123 | candidate = test[tt]['message']
124 | cand = candidate.split(',')[0]
125 | count = candidate.split(',')[1]
126 | edist.append(cand)
127 | edist_count.append(count)
128 |
129 | for e in range(len(edist)):
130 | edd = nltk.edit_distance(inputs.upper(), edist[e].upper())
131 |
132 | if edd == 2:
133 | edist_update.append(edist[e])
134 | edist_count_update.append(edist_count[e])
135 |
136 | if len(edist_update) != 0:
137 | index = edist_count_update.index(max(edist_count_update, key=int))  # compare counts numerically, not lexicographically
138 | min_candidates = edist_update[index]
139 | edd_min_find = 1
140 |
141 | if edd_min_find != 1:
142 | min_candidates = False
143 |
144 |
145 | if min_candidates != False:
146 | return str(min_candidates)
147 | else:
148 | return str(target_text)
149 |
150 | else: # added
151 | return str(target_text)
152 |
153 | else:
154 | # only numeric pred_text
155 | return str(target_text)
156 | else:
157 | # Combination of 140D -> 140D
158 | return str(target_text)
159 | else:
160 | return str(target_text)
--------------------------------------------------------------------------------
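
A minimal usage sketch, assuming a local Elasticsearch with the `osm-voca` index populated (the host settings mirror post_ocr_main.py). The function escalates fuzziness 0 → 1 → 2 and returns either the most frequent vocabulary candidate within edit distance 2 or the input unchanged.

```python
import elasticsearch
from post_ocr import lexical_search_query

es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}], timeout=1000)

print(lexical_search_query('Minneapols', es))  # hypothetical OCR error
print(lexical_search_query('140D', es))        # mixed alphanumeric: returned as-is
```
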
/m4_post_ocr/post_ocr_main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import ast
4 | import re
5 | import pandas as pd
6 | import numpy as np
7 | import geojson
8 | import json
9 | from dotenv import load_dotenv
10 | from shapely.geometry import Polygon
11 | import psycopg2
12 | import reverse_geocoder as rg
13 | import pycountry_convert as pc
14 | from pyproj import Transformer, transform, Proj
15 | import sys
16 | import elasticsearch
17 | import elasticsearch.helpers
18 | from post_ocr import lexical_search_query
19 | import logging
20 | import time
21 |
22 | logging.basicConfig(level=logging.INFO)
23 |
24 |
25 |
26 | def save_postocr_results(in_geojson_data, unique_map_text_li, es_conn, output_dir, in_geojson_filename):
27 | result_dict_postocr = dict()
28 | for map_text in set(unique_map_text_li):
29 | map_text_candidate = lexical_search_query(map_text, es_conn)
30 | result_dict_postocr[map_text] = map_text_candidate
31 |
32 | for feature_data in in_geojson_data["features"]:
33 | feature_data["properties"]["postocr_label"] = result_dict_postocr[str(feature_data["properties"]["text"]).lower()]
34 |
35 | with open(os.path.join(output_dir, in_geojson_filename.split("/")[-1]), 'w', encoding='utf8') as output_geojson:
36 | geojson.dump(in_geojson_data, output_geojson, ensure_ascii=False)
37 |
38 |
39 |
40 | def main(args):
41 | geojson_file = args.in_geojson_file
42 | output_dir = args.out_geojson_dir
43 |
44 |
45 | try:
46 | es = elasticsearch.Elasticsearch([{'host': "127.0.0.1", 'port': 9200}], timeout=1000)
47 | es_connected = es.ping()
48 | except:
49 | logging.warning('elasticsearch.ConnectionError.ElasticConnectionError while running %s', geojson_file.split("/")[-1])
50 | return
51 | if not es_connected:
52 | logging.warning('Error on elasticsearch connection while running %s', geojson_file.split("/")[-1])
53 | return
54 | es_logger = elasticsearch.logger
55 | es_logger.setLevel(elasticsearch.logging.WARNING)
56 |
57 | with open(geojson_file) as f:
58 | # Need update
59 | try:
60 | data = geojson.load(f)
61 | except json.decoder.JSONDecodeError:
62 | if os.path.getsize(geojson_file) == 0:
63 | open(os.path.join(output_dir, geojson_file.split("/")[-1]), 'w').close()
64 | return  # empty input -> empty output; 'data' would be undefined past this point
65 | else:
66 | logging.info('JSONDecodeError %s', geojson_file)
67 | # sys.exit(1)
68 | return
69 |
70 | min_x, min_y, max_x, max_y = float('inf'), float('inf'), float('-inf') ,float('-inf')
71 | unique_map_text = []
72 | for feature_data in data['features']:
73 | unique_map_text.append(str(feature_data['properties']['text']).lower())
74 |
75 | # if postocr_only:
76 | save_postocr_results(data, unique_map_text, es, output_dir, geojson_file)
77 | logging.info('Done generating standalone post-ocr geojson for %s', geojson_file.split("/")[-1])
78 | # return
79 |
80 |
81 | if __name__ == '__main__':
82 | parser = argparse.ArgumentParser()
83 | parser.add_argument('--in_geojson_file', type=str, default='data/100_maps_geojson_abc_geocoord/',
84 | help='input geojson')
85 | parser.add_argument('--out_geojson_dir', type=str, default='data/100_maps_geojson_abc_linked/',
86 | help='output dir for converted geojson files')
87 |
88 | args = parser.parse_args()
89 |
90 | main(args)
--------------------------------------------------------------------------------
/m4_post_ocr/preprocess.py:
--------------------------------------------------------------------------------
1 | from elasticsearch import Elasticsearch
2 | import logging
3 | import requests
4 | import json
5 |
6 | import http.client as http_client
7 |
8 | import pandas as pd
9 | import string
10 | import emoji
11 | import time
12 |
13 |
14 | import glob
15 | import os
16 |
17 |
18 |
19 | def read_name():
20 | http_client.HTTPConnection.debuglevel = 1
21 |
22 | logging.basicConfig()
23 | logging.getLogger().setLevel(logging.DEBUG)
24 |
25 | requests_log = logging.getLogger("requests.packages.urllib3")
26 | requests_log.setLevel(logging.DEBUG)
27 | requests_log.propagate = True
28 |
29 |
30 | #Popularity Count
31 | headers = {
32 | 'Content-Type': 'application/json',
33 | }
34 |
35 | json_body = '{"track_total_hits": true}'
36 |
37 | resp = requests.get(f'http://localhost:9200/osm/_search?&pretty=true', \
38 | data=json_body, \
39 | headers = headers)
40 | resp_json = json.loads(resp.text)
41 | total_value = resp_json["hits"]["total"]["value"]
42 |
43 |
44 | # Initialize
45 | json_body_page = '{"track_total_hits": true, "size": 10000, "sort": [{"ogc_fid": {"order" : "desc" }}]}'
46 | resp_page = requests.post(f'http://localhost:9200/osm/_search?', \
47 | data=json_body_page, \
48 | headers = headers)
49 | resp_page_json = json.loads(resp_page.text)
50 |
51 | name_list = []
52 |
53 | st = []
54 | for h in range(len(resp_page_json["hits"]["hits"])):
55 | st = resp_page_json["hits"]["hits"][h]["sort"]
56 | text = resp_page_json["hits"]["hits"][h]["_source"]["name"]
57 | token_list = text.split(" ")
58 | for t in range(len(token_list)):
59 | name_list.append(token_list[t].lower())
60 |
61 | n_val = len(resp_page_json["hits"]["hits"])
62 | st_list = [st[0]]
63 | error_track = []
64 |
65 | # Iterate over pages
66 | while n_val != total_value:
67 |
68 | try: #osm_id.keyword
69 | json_body_page2 = '{"track_total_hits": true, "size": 10000, "sort": [{"ogc_fid": {"order" : "desc" }}], "search_after": ['+str(st[0])+']}'
70 | resp_page2 = requests.get(f'http://localhost:9200/osm/_search?', \
71 | data=json_body_page2, \
72 | headers = headers)
73 | resp_page_json2 = json.loads(resp_page2.text)
74 |
75 | for h in range(len(resp_page_json2["hits"]["hits"])):
76 | st = resp_page_json2["hits"]["hits"][h]["sort"]
77 | text = resp_page_json2["hits"]["hits"][h]["_source"]["name"]
78 | token_list = text.split(" ")
79 | for t in range(len(token_list)):
80 | name_list.append(token_list[t].lower())
81 |
82 | n_val += len(resp_page_json2["hits"]["hits"])
83 | st_list.append(st[0])
84 | print(f'n_val: {n_val} done!')
85 |
86 | except Exception as e:
87 | print(e)  # Exception has no .message attribute in Python 3
88 | error_track.append(str(st[0]))
89 |
90 | with open('error_id.txt', 'w') as fp:
91 | for item in error_track:
92 | fp.write("%s\n" % item)
93 | print('Done')
94 |
95 | with open('name_mid.txt', 'w') as fp:
96 | for item in name_list:
97 | fp.write("%s\n" % item)
98 | print('Done')
99 |
100 | with open('last_sort_id.txt', 'w') as fp:
101 | for item in st_list:
102 | fp.write("%s\n" % item)
103 | print('Done')
104 |
105 | with open('name.txt', 'w') as fp:
106 | for item in name_list:
107 | fp.write("%s\n" % item)
108 | print('Done')
109 |
110 | with open('name_set.txt', 'w') as fp:
111 | name_set = list(set(name_list))
112 | for item in name_set:
113 | fp.write("%s\n" % item)
114 | print('Done')
115 |
116 |
117 |
118 | def counting():
119 | input_txt = "name.txt"
120 |
121 | if os.path.exists(input_txt):
122 |
123 | punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
124 | start2 = time.time()
125 | set_lst2 = []
126 | with open(input_txt) as file:
127 | for item in file:
128 | name = emoji.replace_emoji(item.strip(), '') #filter out emoji
129 | name = name.translate(str.maketrans('', '', string.punctuation))
130 | if len(name) > 0:
131 | set_lst2.append(name.upper())
132 |
133 | end2 = time.time()
134 | start = time.time()
135 |
136 | dic = {}
137 | count = 0
138 |
139 | for word in set_lst2:
140 | count += 1
141 | if word in dic:
142 | dic[word] += 1
143 | else:
144 | dic[word] = 1
145 |
146 | end = time.time()
147 |
148 | print(end - start)
149 | print(end2 - start2)
150 | dff = pd.DataFrame.from_dict([dic]).T
151 |
152 | dff.reset_index(inplace=True)
153 | dff = dff.rename(columns = {'index':'name', 0: 'count'})  # the transposed column label is the integer 0, not '0'
154 | dff.to_csv("out.csv", index=False)
155 |
156 |
157 | if __name__ == '__main__':
158 | read_name()
159 | counting()
160 |
--------------------------------------------------------------------------------
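
read_name() pages through the index by splicing `search_after` values into JSON strings by hand. A compact sketch of the same pattern with the Python client; the index name and `ogc_fid` sort key come from the script, and the host is assumed local.

```python
import elasticsearch

es = elasticsearch.Elasticsearch([{'host': 'localhost', 'port': 9200}])

def iter_names(index='osm', page_size=10000):
    """Yield the 'name' field of every document, paging with search_after."""
    body = {'size': page_size,
            'sort': [{'ogc_fid': {'order': 'desc'}}],
            'track_total_hits': True}
    search_after = None
    while True:
        if search_after is not None:
            body['search_after'] = search_after
        hits = es.search(index=index, body=body)['hits']['hits']
        if not hits:
            return
        for hit in hits:
            yield hit['_source']['name']
        search_after = hits[-1]['sort']
```
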
/m5_geocoordinate_converter/convert_geojson_to_geocoord.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import logging
4 | import ast
5 |
6 | import pandas as pd
7 | import numpy as np
8 | import geojson
9 | import json  # needed for json.decoder.JSONDecodeError below
10 |
11 | logging.basicConfig(level=logging.INFO)
12 |
13 |
14 | def main(args):
15 | geojson_file = args.in_geojson_file
16 | output_dir = args.out_geojson_dir
17 |
18 | sample_map_df = pd.read_csv(args.sample_map_path, dtype={'image_no': str})
19 | sample_map_df['image_no'] = sample_map_df['image_no'].str.replace('.1.jp2', '', regex=False).str.replace('.jp2', '', regex=False)
20 | sample_map_df['image_no'] = sample_map_df['image_no'].apply(lambda x: x[:-2] if x[-2:] == '.1' else x)
21 |
22 | geojson_filename_id = geojson_file.split(".")[0].split("/")[-1]
23 |
24 | if not os.path.isdir(os.path.join(output_dir, "tmp/")):
25 | os.makedirs(os.path.join(output_dir, "tmp/"))
26 |
27 | row = sample_map_df[sample_map_df['image_no'] == geojson_filename_id]
28 | if not row.empty:
29 | gcps = ast.literal_eval(row.iloc[0]['gcps'])
30 | gcp_str = ''
31 | for gcp in gcps:
32 | lng, lat = gcp['location']
33 | x, y = gcp['pixel']
34 | gcp_str += '-gcp ' + str(x) + ' ' + str(y) + ' ' + str(lng) + ' ' + str(lat) + ' '
35 |
36 | transform_method = row.iloc[0]['transformation_method']
37 | assert transform_method in ['affine', 'polynomial', 'tps']
38 |
39 | # minus in y
40 | with open(geojson_file) as img_geojson:
41 | try:
42 | img_data = geojson.load(img_geojson)
43 | except json.decoder.JSONDecodeError:
44 | if os.stat(geojson_file).st_size == 0:
45 | with open(os.path.join(output_dir, geojson_filename_id + '.geojson'), 'w') as fp:
46 | pass
47 | logging.info('Done generating empty geocoord geojson for %s', geojson_file)
48 | else:
49 | logging.info('JSONDecodeError %s', geojson_file)
50 | return
51 |
52 | for img_feature in img_data['features']:
53 | arr = np.array(img_feature['geometry']['coordinates'])
54 | img_feature['properties']['img_coordinates'] = np.array(arr).reshape(-1, 2).tolist()
55 |
56 | arr[:, :, 1] *= -1
57 | img_feature['geometry']['coordinates'] = arr.tolist()
58 |
59 | with open(os.path.join(os.path.join(output_dir, "tmp/"), geojson_filename_id + '.geojson'), 'w', encoding='utf8') as geocoord_geojson:
60 | geojson.dump(img_data, geocoord_geojson, ensure_ascii=False)
61 |
62 | input = '"' + output_dir + "/tmp/" + geojson_filename_id + '.geojson"'
63 | output = '"' + output_dir + "/" + geojson_filename_id + '.geojson"'
64 |
65 | if transform_method == 'affine':
66 | gecoord_convert_command = 'ogr2ogr -f "GeoJSON" ' + output + " " + input + ' -order 1 -s_srs epsg:4326 -t_srs epsg:3857 -skipfailures ' + gcp_str
67 |
68 | elif transform_method == 'polynomial':
69 | gecoord_convert_command = 'ogr2ogr -f "GeoJSON" ' + output + " " + input + ' -order 2 -s_srs epsg:4326 -t_srs epsg:3857 -skipfailures ' + gcp_str
70 |
71 | elif transform_method == 'tps':
72 | gecoord_convert_command = 'ogr2ogr -f "GeoJSON" ' + output + " " + input + ' -tps -s_srs epsg:4326 -t_srs epsg:3857 -skipfailures ' + gcp_str
73 |
74 | else:
75 | raise NotImplementedError
76 |
77 | ret_value = os.system(gecoord_convert_command)
78 | if os.path.exists(os.path.join(os.path.join(output_dir, "tmp/"), geojson_filename_id + '.geojson')):
79 | os.remove(os.path.join(os.path.join(output_dir, "tmp/"), geojson_filename_id + '.geojson'))
80 |
81 | if ret_value != 0:
82 | logging.info('Failed generating geocoord geojson for %s', geojson_file)
83 | else:
84 | logging.info('Done generating geocoord geojson for %s', geojson_file)
85 |
86 |
87 | if __name__ == '__main__':
88 | parser = argparse.ArgumentParser()
89 | parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
90 | help='path to sample map csv, which contains gcps info')
91 | parser.add_argument('--in_geojson_file', type=str,
92 | help='input geojson file; results of M2')
93 | parser.add_argument('--out_geojson_dir', type=str, default='data/100_maps_geojson_abc_geocoord/',
94 | help='output dir for converted geojson files')
95 |
96 | args = parser.parse_args()
97 |
98 | main(args)
99 |
--------------------------------------------------------------------------------
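
The `arr[:, :, 1] *= -1` step undoes the sign flip applied in m3: stitch_output.py stores `-y` so image-coordinate polygons render upright in QGIS, while the GCPs passed to ogr2ogr expect pixel rows growing downward. A two-step sketch of the convention with a made-up point:

```python
import numpy as np

pixel_xy = np.array([[512.0, 300.0]])           # (col, row) in the original image

# m3 (stitch_output.py): negate y for QGIS display
display_xy = pixel_xy * np.array([1.0, -1.0])   # -> [[512., -300.]]

# m5 (this script): negate y again to recover pixel rows before ogr2ogr
recovered = display_xy * np.array([1.0, -1.0])  # -> [[512., 300.]]
assert np.array_equal(recovered, pixel_xy)
```
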
/m6_entity_linker/create_elasticsearch_index.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 |
4 | import pandas as pd
5 |
6 | import elasticsearch
7 | from elasticsearch import helpers
8 |
9 | logging.basicConfig(level=logging.INFO)
10 |
11 | def main(args):
12 | # elasticsearch connection
13 | try:
14 | es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}], timeout=1000)
15 | es_connected = es.ping()
16 | except:
17 | logging.warning('Elasticsearch connection error while indexing %s', args.in_csv)
18 | return
19 | if not es_connected:
20 | logging.warning('Error on elasticsearch connection while indexing %s', args.in_csv)
21 | return
22 | es_logger = elasticsearch.logger
23 | es_logger.setLevel(elasticsearch.logging.WARNING)
24 |
25 | df = pd.read_csv(args.in_csv)
26 |
27 | for index, row in df.iterrows():
28 | if index % 1000 == 0: print(index, "processed ...")
29 |
30 | es_query = {"query": {
31 | "bool": {
32 | "must": [
33 | {
34 | "match": {'name': str(row['name']).lower().replace("'","\'")}
35 | }
36 | ]
37 | }
38 | }}
39 |
40 | try:
41 | osm_count = es.count(index="osm", body=es_query)["count"]
42 | except elasticsearch.ElasticsearchException as es_error:
43 | logging.warning('ElasticsearchException while counting matches for %s', row['name'])
44 | continue
45 |
46 | # skip word that has more than 10000 matched cases in OSM
47 | if osm_count > 10000:
48 | # logging.info('ElasticsearchException while running %s', geojson_file.split("/")[-1])
49 | continue
50 |
51 | try:
52 | es_results = elasticsearch.helpers.scan(es, index="osm", query=es_query)
53 | except elasticsearch.ElasticsearchException as es_error:
54 | logging.warning('ElasticsearchException while scanning matches for %s', row['name'])
55 | continue
56 |
57 | es_results = [(hit["_source"]['source_table'], hit["_source"]['osm_id']) for hit in es_results]
58 | if len(es_results) == 0:
59 | # logging.info('No elasticsearch results of word %s while running %s', map_text, geojson_file.split("/")[-1])
60 | continue
61 |
62 | df.loc[index, 'source_table_osm_id'] = str(es_results)
63 |
64 | df = df.dropna()
65 | df.to_csv(args.out_csv, index=False)
66 |
67 | if __name__ == '__main__':
68 | parser = argparse.ArgumentParser()
69 | parser.add_argument('--in_csv', type=str, default='out.csv', help='input csv')
70 | parser.add_argument('--out_csv', type=str, default='./m6_entity_linker/osm_linker.csv', help='output csv')
71 | args = parser.parse_args()
72 |
73 | main(args)
--------------------------------------------------------------------------------
/m6_entity_linker/create_spatial_index_postgres.py:
--------------------------------------------------------------------------------
1 | from dotenv import load_dotenv
2 | import psycopg2
3 | import os
4 |
5 | load_dotenv()
6 | DB_HOST = os.getenv("DB_HOST")
7 | DB_PORT = os.getenv("DB_PORT")
8 | DB_USERNAME = os.getenv("DB_USERNAME")
9 | DB_PASSWORD = os.getenv("DB_PASSWORD")
10 | DB_NAME = os.getenv("DB_NAME")
11 |
12 | conn = psycopg2.connect(database=DB_NAME, host=DB_HOST, user=DB_USERNAME, password=DB_PASSWORD, port=DB_PORT)
13 | cur = conn.cursor()
14 | continents = ['africa', 'asia', 'central_america', 'north_america', 'south_america', 'antarctica', 'australia_oceania', 'europe']
15 | tables = ['points', 'lines', 'multilinestrings','multipolygons','other_relations']
16 |
17 | for continent in continents:
18 | for table in tables:
19 | name = continent + "." + table
20 | cur.execute(f'''CREATE INDEX {name.replace(".","_")+"_index"} ON {name} USING gist (wkb_geometry);''')
21 | cur.execute(f'''CREATE INDEX {name.replace(".","_")+"_osm_index"} ON {name} (osm_id);''')
22 | print(name, " creating index...")
23 |
24 | conn.commit()
25 | cur.close()
26 | conn.close()
27 |
--------------------------------------------------------------------------------
/m6_entity_linker/entity_linking.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import ast
4 | import logging
5 | import time
6 |
7 | import pandas as pd
8 | import numpy as np
9 | import geojson
10 | import json
11 |
12 | from shapely.ops import transform
13 | from shapely.geometry import Polygon
14 | import pyproj
15 |
16 | import elasticsearch
17 |
18 | from dotenv import load_dotenv
19 | import psycopg2
20 | from postgres_logger import LinkerLoggingConnection
21 |
22 | logging.basicConfig(level=logging.INFO)
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 | def main(args):
27 | input_dir = args.in_geojson_dir
28 | output_dir = args.out_geojson_dir
29 |
30 | # elasticsearch connection
31 | try:
32 | es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}], timeout=1000)
33 | es_connected = es.ping()
34 | except:
35 | logging.warning('elasticsearch.ConnectionError.ElasticConnectionError')
36 | return
37 | if not es_connected:
38 | logging.warning('Error on elasticsearch connection')
39 | return
40 | es_logger = elasticsearch.logger
41 | es_logger.setLevel(elasticsearch.logging.WARNING)
42 |
43 | # postgres connection
44 | load_dotenv()
45 | DB_HOST = os.getenv("DB_HOST")
46 | DB_PORT = os.getenv("DB_PORT")
47 | DB_USERNAME = os.getenv("DB_USERNAME")
48 | DB_PASSWORD = os.getenv("DB_PASSWORD")
49 | DB_NAME = os.getenv("DB_NAME")
50 |
51 | try:
52 | conn = psycopg2.connect(host=DB_HOST, database=DB_NAME, user=DB_USERNAME, password=DB_PASSWORD, port=DB_PORT, connection_factory=LinkerLoggingConnection)
53 | except Exception as e:
54 | logging.warning('Error on psycopg2 connection: %s', e)
55 | return
56 |
57 | sample_map_df = pd.read_csv(args.sample_map_path, dtype={'image_no': str})
58 | sample_map_df['image_no'] = sample_map_df['image_no'].str.replace('.1.jp2', '', regex=False).str.replace('.jp2', '', regex=False)
59 | sample_map_df['image_no'] = sample_map_df['image_no'].apply(lambda x: x[:-2] if x[-2:] == '.1' else x)
60 |
61 | conn.initialize(logger)
62 | conn.autocommit = True
63 |
64 | with conn.cursor() as cur:
65 | for index, record in sample_map_df.iterrows():
66 | input_geojson_file = os.path.join(input_dir, record.image_no + ".geojson")
67 |
68 | if not os.path.exists(input_geojson_file):
69 | logging.warning('PostOCR output does not exist %s', record.image_no + ".geojson")
70 | continue
71 |
72 | if os.path.exists(os.path.join(output_dir, input_geojson_file.split("/")[-1])):
73 | logging.info('EntityLinker output already exists %s', record.image_no + ".geojson")
74 | continue
75 |
76 | with open(input_geojson_file) as f:
77 | try:
78 | data = geojson.load(f)
79 | except json.decoder.JSONDecodeError:
80 | if os.path.getsize(input_geojson_file) == 0:
81 | with open(os.path.join(output_dir, input_geojson_file.split("/")[-1]), 'w') as fp:
82 | continue
83 | else:
84 | logging.info('JSONDecodeError %s', input_geojson_file)
85 | continue
86 |
87 | for feature_data in data['features']:
88 | map_text = str(feature_data['properties']['postocr_label'])
89 |
90 | # skip null geometry
91 | if feature_data['geometry'] is None:
92 | feature_data["properties"]["osm_id"] = []
93 | continue
94 |
95 | # skip labels of three characters or fewer
96 | if len(map_text) <= 3:
97 | feature_data["properties"]["osm_id"] = []
98 | continue
99 |
100 | pts = np.array(feature_data['geometry']['coordinates']).reshape(-1, 2)
101 | map_polygon = Polygon(pts)
102 |
103 | es_query = {
104 | "bool": {
105 | "must": [
106 | {
107 | "match": {'name': map_text.lower().replace("'","\'")}
108 | }
109 | ]
110 | }
111 | }
112 |
113 | try:
114 | es_results = es.search(index="osm-linker", query=es_query)
115 | except elasticsearch.ElasticsearchException as es_error:
116 | logging.warning('ElasticsearchException while running %s', input_geojson_file.split("/")[-1])
117 | continue
118 |
119 | if es_results['hits']['total']['value'] == 0:
120 | # logging.info('No elasticsearch results of word %s while running %s', map_text, input_geojson_file.split("/")[-1])
121 | feature_data["properties"]["osm_id"] = []
122 | continue
123 |
124 | es_results = [ast.literal_eval(hit["_source"]['source_table_osm_id']) for hit in es_results['hits']['hits']][0]
125 | output_osm_ids = []
126 | source_tables = set([table for table, _ in es_results if "other_relations" not in table])
127 |
128 | for source_table in source_tables:
129 | sql = ""
130 | osm_ids = [osm_id for table, osm_id in es_results if table == source_table]
131 |
132 | if "points" in source_table:
133 | sql = f"""SELECT osm_id
134 | FROM {source_table}
135 | WHERE ST_CONTAINS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), wkb_geometry)
136 | AND osm_id = ANY (%s)
137 | """
138 |
139 | elif "line" in source_table:
140 | sql = f"""SELECT osm_id
141 | FROM {source_table}
142 | WHERE ST_INTERSECTS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), wkb_geometry)
143 | AND osm_id = ANY (%s)
144 | """
145 |
146 | elif "polygon" in source_table:
147 | sql = f"""SELECT osm_id
148 | FROM {source_table}
149 | WHERE ST_INTERSECTS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), ST_MakeValid(wkb_geometry, 'method=structure'))
150 | AND osm_id = ANY (%s)
151 | """
152 |
153 | try:
154 | cur.execute(sql,(osm_ids,))
155 | except Exception as e:
156 | logging.warning('Error occurred while executing SQL for %s: %s', input_geojson_file.split("/")[-1], e)
157 | if "TopologyException" in repr(e):
158 | continue
159 | else:
160 | return
161 |
162 | sql_result = cur.fetchall()
163 | if len(sql_result) != 0:
164 | output_osm_ids.extend([x[0] for x in sql_result])
165 |
166 | feature_data["properties"]["osm_id"] = output_osm_ids
167 |
168 | with open(os.path.join(output_dir, input_geojson_file.split("/")[-1]), 'w', encoding='utf8') as output_geojson:
169 | geojson.dump(data, output_geojson, ensure_ascii=False)
170 | logging.info('Done generating geojson for %s', input_geojson_file.split("/")[-1])
171 |
172 | if __name__ == '__main__':
173 | parser = argparse.ArgumentParser()
174 | parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
175 | help='path to sample map csv, which contains gcps info')
176 | parser.add_argument('--in_geojson_dir', type=str, default='data/100_maps_geojson_abc_geocoord/',
177 | help='input geojson')
178 | parser.add_argument('--out_geojson_dir', type=str, default='data/100_maps_geojson_abc_linked/',
179 | help='output dir for converted geojson files')
180 | args = parser.parse_args()
181 | main(args)
182 |
--------------------------------------------------------------------------------
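
For reference, the SQL issued for a points table, instantiated for one hypothetical label polygon; the geometry is embedded as WKT exactly as the f-string above does (the label polygon is in EPSG:3857 and is reprojected to EPSG:4326 to match the OSM geometries):

```python
from shapely.geometry import Polygon

# Hypothetical label polygon in EPSG:3857 (Web Mercator) coordinates
map_polygon = Polygon([(-10384000, 5622000), (-10383000, 5622000),
                       (-10383000, 5621000), (-10384000, 5621000)])

source_table = 'north_america.points'  # schema.table, as created by the m6 upload scripts
sql = f"""SELECT osm_id
FROM {source_table}
WHERE ST_CONTAINS(ST_TRANSFORM(ST_SetSRID(ST_MakeValid('{map_polygon}'), 3857), 4326), wkb_geometry)
AND osm_id = ANY (%s)
"""
print(sql)
# cur.execute(sql, (osm_ids,)) then restricts the spatial test to the ids
# returned by the osm-linker Elasticsearch lookup.
```
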
/m6_entity_linker/logstash_osm_linker.conf:
--------------------------------------------------------------------------------
1 | input {
2 | file {
3 | path => "./m6_entity_linker/osm_linker.csv"
4 | start_position => beginning
5 | }
6 | }
7 | filter {
8 | csv {
9 | separator => ","
10 | columns => [
11 | "name",
12 | "0",
13 | "source_table_osm_id"
14 | ]
15 | }
16 | }
17 | output {
18 | elasticsearch {
19 | hosts => "localhost:9200"
20 | index => "osm-linker"
21 | }
22 | }
--------------------------------------------------------------------------------
/m6_entity_linker/logstash_postgres_world.conf:
--------------------------------------------------------------------------------
1 | input {
2 | jdbc {
3 | jdbc_connection_string => "jdbc:postgresql://localhost:5432/osm"
4 | jdbc_user => ""
5 | jdbc_password => ""
6 | jdbc_paging_enabled => true
7 | jdbc_fetch_size => 100000
8 | jdbc_driver_library => "/usr/share/logstash/logstash-core/lib/jars/postgresql-42.6.0.jar"
9 | jdbc_driver_class => "org.postgresql.Driver"
10 | statement => "SELECT ogc_fid, osm_id, name, source_table FROM entire_continents ORDER BY ogc_fid"
11 | }
12 | }
13 | output {
14 | elasticsearch {
15 | hosts => "localhost:9200"
16 | index => "osm"
17 | document_id => "%{ogc_fid}"
18 | doc_as_upsert => true
19 | }
20 | }
--------------------------------------------------------------------------------
/m6_entity_linker/postgres_logger.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import psycopg2
4 | from psycopg2.extras import LoggingConnection, LoggingCursor
5 |
6 | class LinkerLoggingCursor(LoggingCursor):
7 | def execute(self, query, vars=None):
8 | self.timestamp = time.time()
9 | return super(LinkerLoggingCursor, self).execute(query, vars)
10 |
11 | def callproc(self, procname, vars=None):
12 | self.timestamp = time.time()
13 | return super(LinkerLoggingCursor, self).callproc(procname, vars)
14 |
15 | class LinkerLoggingConnection(LoggingConnection):
16 | def filter(self, msg, curs):
17 | return msg.decode(psycopg2.extensions.encodings[self.encoding], 'replace') + " %d ms" % int((time.time() - curs.timestamp) * 1000)
18 |
19 | def cursor(self, *args, **kwargs):
20 | kwargs.setdefault('cursor_factory', LinkerLoggingCursor)
21 | return LoggingConnection.cursor(self, *args, **kwargs)
22 |
--------------------------------------------------------------------------------
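
Usage sketch, mirroring entity_linking.py: pass the class as `connection_factory` and call `initialize()` with a logger so every statement is logged with its execution time (connection parameters are placeholders; real values come from the `.env` file).

```python
import logging

import psycopg2
from postgres_logger import LinkerLoggingConnection

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

conn = psycopg2.connect(host='localhost', database='osm', user='postgres',
                        password='postgres', port=5432,
                        connection_factory=LinkerLoggingConnection)
conn.initialize(logger)  # required by LoggingConnection before use

with conn.cursor() as cur:
    cur.execute('SELECT 1')  # logged as: "SELECT 1 <elapsed> ms"
    print(cur.fetchone())
```
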
/m6_entity_linker/upload_osm_to_postgres_all_continents.py:
--------------------------------------------------------------------------------
1 | from dotenv import load_dotenv
2 | import psycopg2
3 | import os
4 |
5 | load_dotenv()
6 | DB_HOST = os.getenv("DB_HOST")
7 | DB_PORT = os.getenv("DB_PORT")
8 | DB_USERNAME = os.getenv("DB_USERNAME")
9 | DB_PASSWORD = os.getenv("DB_PASSWORD")
10 | DB_NAME = os.getenv("DB_NAME")
11 |
12 | conn = psycopg2.connect(database=DB_NAME, host=DB_HOST, user=DB_USERNAME, password=DB_PASSWORD, port=DB_PORT)
13 | cur = conn.cursor()
14 | cur.execute('''CREATE TABLE entire_continents (
15 | ogc_fid SERIAL PRIMARY KEY,
16 | osm_id character varying,
17 | name character varying,
18 | source_table character varying
19 | );''')
20 |
21 |
22 | continents = ['africa', 'asia', 'central_america', 'north_america', 'south_america', 'antarctica', 'australia_oceania', 'europe']
23 | tables = ['points', 'lines', 'multilinestrings','multipolygons','other_relations']
24 |
25 | for continent in continents:
26 | for table in tables:
27 | name = continent + "." + table
28 | cur.execute(f'''INSERT INTO entire_continents(osm_id, name, source_table)
29 | SELECT osm_id, name, '{name}' FROM {name}
30 | WHERE name IS NOT NULL AND osm_id IS NOT NULL ;''')
31 | print(name, " inserting into entire_continents...")
32 |
33 | conn.commit()
34 | cur.close()
35 | conn.close()
--------------------------------------------------------------------------------
/m6_entity_linker/upload_osm_to_postgres_ogr2ogr.py:
--------------------------------------------------------------------------------
1 | from dotenv import load_dotenv
2 | import subprocess
3 | import os
4 |
5 | import psycopg2
6 |
7 | continents = ['africa', 'asia', 'central_america', 'north_america', 'south_america', 'antarctica', 'australia_oceania', 'europe']
8 |
9 | load_dotenv()
10 | DB_HOST = os.getenv("DB_HOST")
11 | DB_PORT = os.getenv("DB_PORT")
12 | DB_USERNAME = os.getenv("DB_USERNAME")
13 | DB_PASSWORD = os.getenv("DB_PASSWORD")
14 | DB_NAME = os.getenv("DB_NAME")
15 |
16 |
17 | try:
18 | conn = psycopg2.connect(host=DB_HOST, database=DB_NAME, user=DB_USERNAME, password=DB_PASSWORD)
19 | except psycopg2.OperationalError as e:
20 | raise SystemExit(f'Error on psycopg2 connection: {e}')
21 |
22 | cur = conn.cursor()
23 |
24 | for continent in continents:
25 | cur.execute(f'''CREATE SCHEMA {continent};''')
26 | print(continent, " creating schema...")
27 |
28 | conn.commit()
29 | cur.close()
30 | conn.close()
31 |
32 | for continent in continents:
33 | cmd = f'''ogr2ogr -f PostgreSQL PG:"dbname='{DB_NAME}' host='{DB_HOST}' port='{DB_PORT}' user='{DB_USERNAME}' password='{DB_PASSWORD}'" {continent.replace('_','-')}-latest.osm.pbf -nlt PROMOTE_TO_MULTI -lco SCHEMA={continent}'''
34 | print("--", continent, "--")
35 | print(cmd)
36 | subprocess.call(cmd, shell=True)
--------------------------------------------------------------------------------
/m_sanborn/s1_geocoding.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import geojson
4 | import geocoder
5 | import json
6 | import time
7 | import pdb
8 |
9 |
10 | def arcgis_geocoding(place_name, maxRows = 5):
11 | try:
12 | response = geocoder.arcgis(place_name,maxRows=maxRows)
13 | return response.json
14 |     except Exception as e:
15 | print(e)
16 | return -1
17 |
18 |
19 | def google_geocoding(place_name, api_key = None, maxRows = 5):
20 | try:
21 | response = geocoder.google(place_name, key=api_key, maxRows = maxRows)
22 | return response.json
23 |     except Exception as e:
24 | print(e)
25 | return -1
26 |
27 | def osm_geocoding(place_name, maxRows = 5):
28 | try:
29 | response = geocoder.osm(place_name, maxRows = maxRows)
30 | return response.json
31 |     except Exception as e:
32 | print(e)
33 | return -1
34 |
35 |
36 | def geonames_geocoding(place_name, user_name = None, maxRows = 5):
37 | try:
38 | response = geocoder.geonames(place_name, key = user_name, maxRows=maxRows)
39 | # hourly limit of 1000 credits
40 | time.sleep(4)
41 | return response.json
42 |     except Exception as e:
43 | print(e)
44 | return -1
45 |
46 |
47 | def geocoding(args):
48 | output_folder = args.output_folder
49 | input_map_geojson_path = args.input_map_geojson_path
50 | api_key = args.api_key
51 | user_name = args.user_name
52 | geocoder_option = args.geocoder_option
53 | max_results = args.max_results
54 | suffix = args.suffix
55 |
56 | with open(input_map_geojson_path, 'r') as f:
57 | data = geojson.load(f)
58 |
59 | map_name = os.path.basename(input_map_geojson_path).split('.')[0]
60 | output_folder = os.path.join(output_folder, geocoder_option)
61 |
62 | if not os.path.isdir(output_folder):
63 | os.makedirs(output_folder)
64 |
65 | output_path = os.path.join(output_folder, map_name) + '.json'
66 |
67 | with open(output_path, 'w') as f:
68 |         pass  # truncate/create the output file; results are appended per label below
69 |
70 | features = data['features']
71 | for feature in features: # iterate through all the detected text labels
72 | geometry = feature['geometry']
73 | text = feature['properties']['text']
74 | score = feature['properties']['score']
75 |
76 | # suffix = ', Los Angeles'
77 | text = str(text) + suffix
78 |
79 | print(text)
80 |
81 | if geocoder_option == 'arcgis':
82 |             results = arcgis_geocoding(text, maxRows = max_results)
83 | elif geocoder_option == 'google':
84 | results = google_geocoding(text, api_key = api_key, maxRows = max_results)
85 | elif geocoder_option == 'geonames':
86 | results = geonames_geocoding(text, user_name = user_name, maxRows = max_results)
87 | elif geocoder_option == 'osm':
88 | results = osm_geocoding(text, maxRows = max_results)
89 | else:
90 | raise NotImplementedError
91 |
92 | if results == -1:
93 |             # the geocoder raised an exception for this label; skip it
94 | pass
95 | else:
96 | # save results
97 | with open(output_path, 'a') as f:
98 | json.dump({'text':text, 'score':score, 'geometry': geometry, 'geocoding':results}, f)
99 | f.write('\n')
100 |
101 | # pdb.set_trace()
102 |
103 |
104 | def main():
105 | parser = argparse.ArgumentParser()
106 |
107 | parser.add_argument('--output_folder', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geocoding/')
108 | parser.add_argument('--input_map_geojson_path', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geojson_testr/service-gmd-gmd436m-g4364m-g4364lm-g4364lm_g00656189401-00656_01_1894-0001l.geojson')
109 | parser.add_argument('--api_key', type=str, default=None, help='Specify API key if needed')
110 | parser.add_argument('--user_name', type=str, default=None, help='Specify user name if needed')
111 |
112 |     parser.add_argument('--suffix', type=str, default='', help='placename suffix appended to each label (e.g. ", Los Angeles")')
113 |
114 |     parser.add_argument('--max_results', type=int, default=5, help='max number of results returned by geocoder')
115 |
116 | parser.add_argument('--geocoder_option', type=str, default='arcgis',
117 | choices=['arcgis', 'google','geonames','osm'],
118 |                         help='Select geocoder option from ["arcgis","google","geonames","osm"]')
119 |
120 |
121 | args = parser.parse_args()
122 | print('\n')
123 | print(args)
124 | print('\n')
125 |
126 | if not os.path.isdir(args.output_folder):
127 | os.makedirs(args.output_folder)
128 |
129 | geocoding(args)
130 |
131 |
132 | if __name__ == '__main__':
133 |
134 | main()
135 |
136 |
137 |
138 |
--------------------------------------------------------------------------------
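An illustrative invocation of the geocoding script above (paths and suffix are examples, not values from the repo):

python s1_geocoding.py \
    --input_map_geojson_path /path/to/map.geojson \
    --output_folder /path/to/geocoding/ \
    --geocoder_option osm \
    --suffix ', Los Angeles' \
    --max_results 5

This writes one JSON object per line to /path/to/geocoding/osm/map.json, each carrying the label text, its spotting score, the bounding polygon, and the raw geocoder response.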
/m_sanborn/s2_clustering.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | from sklearn.cluster import DBSCAN
5 | from matplotlib import pyplot as plt
6 | import geopandas as gpd
7 | import pandas as pd
8 | from bs4 import BeautifulSoup
9 | from mpl_toolkits.basemap import Basemap
10 | from pyproj import Proj, transform
11 |
12 | from shapely.geometry import Point
13 | from shapely.geometry.polygon import Polygon
14 | import numpy as np
15 | from shapely.geometry import MultiPoint
16 | from geopy.distance import great_circle
17 |
18 |
19 | county_index_dict = {'Cuyahoga County (OH)': 193,
20 | 'Fulton County (GA)': 73,
21 | 'Kern County (CA)': 2872,
22 | 'Lancaster County (NE)': 1629,
23 | 'Los Angeles County (CA)': 44,
24 | 'Mexico': -1,
25 | 'Nevada County (CA)': 46,
26 | 'New Orleans (LA)': -1,
27 | 'Pima County (AZ)': 2797,
28 | 'Placer County (CA)': 1273,
29 | 'Providence County (RI)\xa0': 1124,
30 | 'Saint Louis (MO)': -1,
31 | 'San Francisco County (CA)': 1261,
32 | 'San Joaquin County (CA)': 1213,
33 | 'Santa Clara (CA)': 48,
34 | 'Santa Cruz (CA)': 2386,
35 | 'Suffolk County (MA)': 272,
36 | 'Tulsa County (OK)': 526,
37 | 'Washington County (AK)': -1,
38 |                      'Washington DC': -1}
39 | inProj, outProj = Proj(init='epsg:3857'), Proj(init='epsg:4326')  # module scope so plotting_func can call transform()
40 | def get_centermost_point(cluster):
41 | centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
42 | centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
43 | return tuple(centermost_point)
44 |
45 | def clustering_func(lat_list, lng_list):
46 | X = [[a,b] for a,b in zip(lat_list, lng_list)]
47 | coords = np.array(X)
48 |
49 | # https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/
50 | kms_per_radian = 6371.0088
51 | epsilon = 1.5 / kms_per_radian
52 | db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
53 | cluster_labels = db.labels_
54 | num_clusters = len(set(cluster_labels))
55 | clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
56 |
57 | centermost_points = get_centermost_point(clusters[0])
58 | return centermost_points
59 |
60 | def plot_points(lat_list, lng_list, target_lat_list=None, target_lng_list = None, pred_lat=None, pred_lng = None, title = None):
61 |
62 | plt.figure(figsize=(10,6))
63 | plt.title(title)
64 |
65 | plt.scatter(lng_list, lat_list, marker='o', c = 'violet', alpha=0.5)
66 | if pred_lat is not None and pred_lng is not None:
67 | plt.scatter(pred_lng, pred_lat, marker='o', c = 'red')
68 |
69 | if target_lat_list is not None and target_lng_list is not None:
70 | plt.scatter(target_lng_list, target_lat_list, 10, c = 'blue')
71 | plt.show()
72 |
73 | def plot_points_basemap(lat_list, lng_list, target_lat_list=None, target_lng_list = None, pred_lat=None, pred_lng = None, title = None):
74 |
75 | plt.figure(figsize=(10,6))
76 | plt.title(title)
77 |
78 | if len(lat_list) >0 and len(lng_list) > 0:
79 | anchor_lat, anchor_lng = lat_list[0], lng_list[0]
80 | elif target_lat_list is not None:
81 | anchor_lat, anchor_lng = target_lat_list[0], target_lng_list[0]
82 | else:
83 | anchor_lat, anchor_lng = 45, -100
84 |
85 | m = Basemap(projection='lcc', resolution=None,
86 | width=8E4, height=8E4,
87 | lat_0=anchor_lat, lon_0=anchor_lng)
88 | m.etopo(scale=0.5, alpha=0.5)
89 | # m.arcgisimage(service='ESRI_Imagery_World_2D', xpixels = 2000, verbose= True)
90 | # m.arcgisimage(service='ESRI_Imagery_World_2D',scale=0.5, alpha=0.5)
91 | # m.arcgisimage(service='ESRI_Imagery_World_2D', xpixels = 2000, verbose= True)
92 |
93 | lng_list, lat_list = m(lng_list, lat_list) # transform coordinates
94 | plt.scatter(lng_list, lat_list, marker='o', c = 'violet', alpha=0.5)
95 |
96 |
97 | if target_lat_list is not None and target_lng_list is not None:
98 | target_lng_list, target_lat_list = m(target_lng_list, target_lat_list)
99 | plt.scatter(target_lng_list, target_lat_list, marker='o', c = 'blue',edgecolor='blue')
100 |
101 | if pred_lat is not None and pred_lng is not None:
102 | pred_lng, pred_lat = m(pred_lng, pred_lat)
103 | plt.scatter(pred_lng, pred_lat, marker='o', c = 'red', edgecolor='black')
104 |
105 | plt.show()
106 |
107 | def plotting_func(loc_sanborn_dir, pred_dict, lat_lng_dict, dataset_name, geocoding_name):
108 |
109 | for map_name, pred in pred_dict.items():
110 |
111 | title = dataset_name + '-' + geocoding_name + '-' + map_name
112 | lat_list = lat_lng_dict[map_name]['lat_list']
113 | lng_list = lat_lng_dict[map_name]['lng_list']
114 |
115 | if dataset_name == 'LoC_sanborn':
116 | xml_path = os.path.join(loc_sanborn_dir,map_name + '.tif.aux.xml')
117 | try:
118 | with open(xml_path) as fp:
119 |                     soup = BeautifulSoup(fp, 'html.parser')  # explicit parser; tag names are matched lowercase below
120 |
121 | target_gcp_list = soup.findAll("metadata")[1].targetgcps.findAll("double")
122 | except Exception as e:
123 |                 print('Failed to parse GCPs from', xml_path, '-', e)
124 | continue
125 |
126 | xy_list = []
127 | for target_gcp in target_gcp_list:
128 | xy_list.append(float(target_gcp.contents[0]))
129 |
130 | x_list = xy_list[0::2]
131 | y_list = xy_list[1::2]
132 |
133 | lng2_list, lat2_list = [],[]
134 | for x1,y1 in zip(x_list, y_list):
135 | x2,y2 = transform(inProj,outProj,x1,y1)
136 | #print (x2,y2)
137 | lng2_list.append(x2)
138 | lat2_list.append(y2)
139 |
140 | plot_points(lat_list, lng_list, lat2_list, lng2_list, pred_lat = pred[0], pred_lng = pred[1], title=title)
141 | else:
142 | plot_points(lat_list, lng_list,pred_lat = pred[0], pred_lng = pred[1], title=title)
143 |
144 |
145 | def clustering(args):
146 | dataset_name = args.dataset_name
147 | geocoding_name = args.geocoding_name
148 | remove_duplicate_location = args.remove_duplicate_location
149 | visualize = args.visualize
150 |
151 | sanborn_output_dir = '/data2/sanborn_maps_output'
152 |
153 | input_dir=os.path.join(sanborn_output_dir, dataset_name, 'geocoding_suffix_testr', geocoding_name)
154 | if remove_duplicate_location:
155 | output_dir = os.path.join(sanborn_output_dir, dataset_name, 'clustering_testr_removeduplicate', geocoding_name)
156 | else:
157 | output_dir = os.path.join(sanborn_output_dir, dataset_name, 'clustering_testr', geocoding_name)
158 |
159 | county_boundary_path = '/home/zekun/Sanborn/cb_2018_us_county_500k/cb_2018_us_county_500k.shp'
160 |
161 | if not os.path.isdir(output_dir):
162 | os.makedirs(output_dir)
163 |
164 |     # inProj/outProj (EPSG:3857 -> EPSG:4326) are defined at module level for plotting_func
165 |
166 |
167 | county_boundary_df = gpd.read_file(county_boundary_path)
168 |
169 | if dataset_name == 'LoC_sanborn':
170 | loc_sanborn_dir = '/data2/sanborn_maps/Sanborn100_Georef/' # for comparing with GT
171 | metadata_tsv_path = '/home/zekun/Sanborn/Sheet_List.tsv'
172 | meta_df = pd.read_csv(metadata_tsv_path, sep='\t')
173 |
174 | file_list = os.listdir(input_dir)
175 |
176 | pred_dict = dict()
177 | lat_lng_dict = dict()
178 | for file_path in file_list:
179 |
180 | map_name = os.path.basename(file_path).split('.')[0]
181 | if dataset_name == 'LoC_sanborn':
182 | county_name = meta_df[meta_df['filename'] == map_name]['County'].values[0]
183 |         elif dataset_name in ('LA_sanborn', 'two_more'):
184 | county_name = 'Los Angeles County (CA)'
185 | else:
186 | raise NotImplementedError
187 |
188 | index = county_index_dict[county_name]
189 | if index >= 0:
190 | poly_geometry = county_boundary_df.iloc[index].geometry
191 |
192 | with open(os.path.join(input_dir,file_path), 'r') as f:
193 | data = f.readlines()
194 |
195 | lat_list = []
196 | lng_list = []
197 | for line in data:
198 |
199 | line_dict = json.loads(line)
200 | geocoding_dict = line_dict['geocoding']
201 | text = line_dict['text']
202 | score = line_dict['score']
203 | geometry = line_dict['geometry']
204 |
205 | if geocoding_dict is None:
206 | continue # if no geolocation returned by geocoder, then skip
207 |
208 | if 'lat' not in geocoding_dict or 'lng' not in geocoding_dict:
209 | #print(geocoding_dict)
210 | continue
211 |
212 | lat = float(geocoding_dict['lat'])
213 | lng = float(geocoding_dict['lng'])
214 |
215 | point = Point(lng, lat)
216 |
217 | if index >= 0:
218 | if point.within(poly_geometry): # geocoding point within county boundary
219 | lat_list.append(lat)
220 | lng_list.append(lng)
221 | else:
222 | pass
223 | else: # cluster based on all results
224 | lat_list.append(lat)
225 | lng_list.append(lng)
226 |
227 |         if remove_duplicate_location and lat_list:
228 |             # deduplicate (lat, lng) pairs together so the two lists stay aligned
229 |             lat_list, lng_list = map(list, zip(*set(zip(lat_list, lng_list))))
230 |
231 | if len(lat_list) >0 and len(lng_list) > 0:
232 | pred = clustering_func(lat_list, lng_list)
233 | # print(pred)
234 |         else:
235 |             print('No data to cluster for', map_name)
236 |             continue  # pred would otherwise be stale or undefined here
237 |         print(map_name, pred)
238 | pred_dict[map_name] = pred
239 | lat_lng_dict[map_name]={'lat_list':lat_list, 'lng_list':lng_list}
240 |
241 | if visualize:
242 | plotting_func(loc_sanborn_dir = loc_sanborn_dir, pred_dict = pred_dict, lat_lng_dict = lat_lng_dict,
243 | dataset_name = dataset_name, geocoding_name = geocoding_name)
244 |
245 | with open(os.path.join(output_dir, 'pred_center.json'),'w') as f:
246 | json.dump(pred_dict, f)
247 |
248 |
249 | def main():
250 | parser = argparse.ArgumentParser()
251 |
252 | parser.add_argument('--dataset_name', type=str, default=None,
253 | choices=['LA_sanborn', 'LoC_sanborn',],
254 | help='dataset name, same as expt_name')
255 | parser.add_argument('--geocoding_name', type=str, default=None,
256 | choices=['google','arcgis','geonames','osm'],
257 | help='geocoder name')
258 | parser.add_argument('--visualize', default = False, action = 'store_true') # Enable this when in notebook
259 | parser.add_argument('--remove_duplicate_location', default=False, action='store_true') # whether remove duplicate geolocations for clustering
260 |
261 | # parser.add_argument('--output_folder', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geocoding/')
262 | # parser.add_argument('--input_map_geojson_path', type=str, default='/data2/sanborn_maps_output/LA_sanborn/geojson_testr/service-gmd-gmd436m-g4364m-g4364lm-g4364lm_g00656189401-00656_01_1894-0001l.geojson')
263 |
264 |
265 | args = parser.parse_args()
266 | print('\n')
267 | print(args)
268 | print('\n')
269 |
270 | clustering(args)
271 |
272 |
273 | if __name__ == '__main__':
274 |
275 | main()
276 |
--------------------------------------------------------------------------------
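The clustering step above follows the haversine-DBSCAN recipe linked in clustering_func: eps is expressed in kilometres and divided by the Earth's mean radius because scikit-learn's haversine metric works in radians. A self-contained sketch with made-up coordinates:

import numpy as np
from sklearn.cluster import DBSCAN

coords = np.array([[34.05, -118.25], [34.06, -118.24], [40.71, -74.01]])  # (lat, lng)
kms_per_radian = 6371.0088
db = DBSCAN(eps=1.5 / kms_per_radian,  # 1.5 km neighbourhood, converted to radians
            min_samples=1,             # every point joins some cluster; no noise label
            algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
print(db.labels_)  # [0 0 1]: the two Los Angeles points cluster together, New York stands alone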
/m_sanborn/s3_gen_geojson.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knowledge-computing/mapkurator-system/5b765d99c4898ce07654d904b6f3b608b9e76189/m_sanborn/s3_gen_geojson.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knowledge-computing/mapkurator-system/5b765d99c4898ce07654d904b6f3b608b9e76189/requirements.txt
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import argparse
4 | import time
5 | import logging
6 | import pandas as pd
7 | import datetime
8 | from PIL import Image
9 | from utils import get_img_path_from_external_id, get_img_path_from_external_id_and_image_no,run_pipeline
10 |
11 | import subprocess
12 |
13 |
14 | logging.basicConfig(level=logging.INFO)
15 | Image.MAX_IMAGE_PIXELS=None # allow reading huge images
16 |
17 | def main():
18 | parser = argparse.ArgumentParser()
19 |
20 | parser.add_argument('--map_kurator_system_dir', type=str, default='/home/maplord/rumsey/mapkurator-system/')
21 | parser.add_argument('--text_spotting_model_dir', type=str, default='/home/maplord/rumsey/TESTR/')
22 |
23 | parser.add_argument('--sample_map_csv_path', type=str, default=None)
24 |
25 | parser.add_argument('--output_folder', type=str, default='/data2/rumsey_output') # Original: /data2/rumsey_output
26 | parser.add_argument('--expt_name', type=str, default='1000_maps') # output prefix
27 |
28 | parser.add_argument('--module_get_dimension', default=False, action='store_true')
29 | parser.add_argument('--module_gen_geotiff', default=False, action='store_true')
30 | parser.add_argument('--module_cropping', default=False, action='store_true')
31 | parser.add_argument('--module_text_spotting', default=False, action='store_true')
32 | parser.add_argument('--module_img_geojson', default=False, action='store_true')
33 | parser.add_argument('--module_geocoord_geojson', default=False, action='store_true')
34 | parser.add_argument('--module_post_ocr_entity_linking', default=False, action='store_true')
35 | parser.add_argument('--module_post_ocr_only', default=False, action='store_true')
36 | parser.add_argument('--module_post_ocr', default=False, action='store_true')
37 |
38 | parser.add_argument('--spotter_model', type=str, default='spotter-v2', choices=['testr', 'spotter-v2', "palette"],
39 | help='Select text spotting model option from ["testr", "spotter-v2", "palette"]') # select text spotting model
40 | parser.add_argument('--spotter_config', type=str, default='/home/maplord/rumsey/TESTR/configs/TESTR/SynMap/SynMap_Polygon.yaml',
41 | help='Path to the config file for text spotting model')
42 | parser.add_argument('--spotter_expt_name', type=str, default='exp',
43 | help='Name of spotter experiment, if empty using config file name')
44 |
45 | # Running spotter-testr
46 | # python run.py --text_spotting_model_dir /home/maplord/rumsey/spotter-testr/TESTR/
47 | # --sample_map_csv_path /home/maplord/maplist_csv/luna_omo_splits/luna_omo_metadata_56628_20220724_part1.csv
48 | # --expt_name 57k_maps_r3 --module_text_spotting
49 | # --spotter_model testr --spotter_config /home/maplord/rumsey/spotter-testr/TESTR/configs/TESTR/SynthMap/SynthMap_Polygon.yaml --spotter_expt_name test
50 | # Running spotter-v2
51 | # python run.py --text_spotting_model_dir /home/maplord/rumsey/spotter-v2/PALEJUN/
52 | # --sample_map_csv_path /home/maplord/maplist_csv/luna_omo_splits/luna_omo_metadata_56628_20220724_part1.csv
53 | # --expt_name 57k_maps_r3 --module_text_spotting
54 | # --spotter_model spotter-v2 --spotter_config /home/maplord/rumsey/spotter-v2/PALEJUN/configs/PALEJUN/SynthMap/SynthMap_Polygon.yaml --spotter_expt_name test
55 | # Running spotter-palette
56 | # python run.py --text_spotting_model_dir /home/maplord/rumsey/spotter-palette/PALETTE/
57 | # --sample_map_csv_path /home/maplord/maplist_csv/luna_omo_splits/luna_omo_metadata_56628_20220724_part1.csv
58 | # --expt_name 57k_maps_r3 --module_text_spotting
59 |     # --spotter_model palette --spotter_config /home/maplord/rumsey/spotter-palette/PALETTE/configs/PALETTE/Pretrain/SynthMap_Polygon.yaml --spotter_expt_name test
60 |
61 | parser.add_argument('--print_command', default=False, action='store_true')
62 | parser.add_argument('--gpu_id', type=int, default=0)
63 |
64 |
65 | args = parser.parse_args()
66 | print('\n')
67 | print(args)
68 | print('\n')
69 |
70 | run_pipeline(args)
71 |
72 |
73 |
74 | if __name__ == '__main__':
75 |
76 | main()
77 |
78 |
79 |
--------------------------------------------------------------------------------
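For reference, a full pipeline run chains the module flags; this combination and the paths are illustrative (each module consumes the previous module's output under output_folder/expt_name):

python run.py \
    --sample_map_csv_path /path/to/sample_maps.csv \
    --expt_name my_expt \
    --module_gen_geotiff --module_cropping --module_text_spotting \
    --module_img_geojson --module_geocoord_geojson --module_post_ocr \
    --spotter_model spotter-v2 --spotter_config /path/to/config.yaml \
    --spotter_expt_name exp1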
/run_img.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import glob
4 | import argparse
5 | import time
6 | import logging
7 | import pandas as pd
8 | import pdb
9 | import datetime
10 | from PIL import Image
11 | from utils import run_pipeline
12 |
13 |
14 |
15 | # this script handles the case where the input is a folder of images (run.py takes a CSV of map metadata instead)
16 | # tested with: /home/maplord/rumsey/mapkurator-system/data/100_maps_crop/crop_leeje_2/test_run_img/
17 | logging.basicConfig(level=logging.INFO)
18 | Image.MAX_IMAGE_PIXELS=None # allow reading huge images
19 |
20 |
21 |
22 | def main():
23 | parser = argparse.ArgumentParser()
24 |
25 | parser.add_argument('--map_kurator_system_dir', type=str, default='/home/maplord/rumsey/mapkurator-system/')
26 | parser.add_argument('--text_spotting_model_dir', type=str, default='/home/maplord/rumsey/TESTR/')
27 |
28 | parser.add_argument('--input_dir_path', type=str, default=None)
29 |
30 | parser.add_argument('--output_folder', type=str, default='/data2/rumsey_output')
31 | parser.add_argument('--expt_name', type=str, default='1000_maps') # output prefix
32 |
33 | parser.add_argument('--module_get_dimension', default=False, action='store_true')
34 | parser.add_argument('--module_gen_geotiff', default=False, action='store_true')
35 | parser.add_argument('--module_cropping', default=False, action='store_true')
36 | parser.add_argument('--module_text_spotting', default=False, action='store_true')
37 | parser.add_argument('--module_img_geojson', default=False, action='store_true')
38 |
39 |
40 | parser.add_argument('--spotter_model', type=str, default='spotter-v2', choices=['testr', 'spotter-v2', "palette"],
41 | help='Select text spotting model option from ["testr", "spotter-v2", "palette"]') # select text spotting model
42 | parser.add_argument('--spotter_config', type=str, default='/home/maplord/rumsey/TESTR/configs/TESTR/SynMap/SynMap_Polygon.yaml',
43 | help='Path to the config file for text spotting model')
44 | parser.add_argument('--spotter_expt_name', type=str, default='exp',
45 | help='Name of spotter experiment, if empty using config file name')
46 |
47 | parser.add_argument('--print_command', default=False, action='store_true')
48 | parser.add_argument('--gpu_id', type=int, default=0)
49 |
50 | args = parser.parse_args()
51 | print('\n')
52 | print(args)
53 | print('\n')
54 |
55 | run_pipeline(args)
56 |
57 |
58 |
59 | if __name__ == '__main__':
60 |
61 | main()
62 |
63 |
64 |
--------------------------------------------------------------------------------
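An illustrative invocation of the folder-input variant; run_img.py only exposes the dimension, geotiff, cropping, spotting, and stitching modules (paths are examples):

python run_img.py \
    --input_dir_path /path/to/images/ \
    --expt_name my_imgs \
    --module_cropping --module_text_spotting --module_img_geojson \
    --spotter_model spotter-v2 --spotter_config /path/to/config.yaml \
    --spotter_expt_name exp1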
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import pandas as pd
4 | import ast
5 | import argparse
6 | import logging
7 | import pdb
8 | from PIL import Image
9 | import datetime
10 | import subprocess
11 | import time
12 |
13 | logging.basicConfig(level=logging.INFO)
14 | Image.MAX_IMAGE_PIXELS=None
15 |
16 |
17 | def execute_command(command, if_print_command):
18 | t1 = time.time()
19 |
20 | if if_print_command:
21 | print(command)
22 |
23 | try:
24 |         subprocess.run(command, shell=True, check=True, capture_output=True)
25 | t2 = time.time()
26 | time_usage = t2 - t1
27 | return {'time_usage':time_usage}
28 | except subprocess.CalledProcessError as err:
29 | error = err.stderr.decode('utf8')
30 | # format error message to one line
31 | error = error.replace('\n','\t')
32 | error = error.replace(',',';')
33 | return {'error': error}
34 |
35 |
36 | def get_img_dimension(img_path):
37 | map_img = Image.open(img_path)
38 | width, height = map_img.size
39 |
40 | return width, height
41 |
42 |
43 | def run_pipeline(args):
44 | # ------------------------- Pass arguments -----------------------------------------
45 | map_kurator_system_dir = args.map_kurator_system_dir
46 | text_spotting_model_dir = args.text_spotting_model_dir
47 |
48 | if hasattr(args, "sample_map_csv_path"):
49 |         # run.py provides a CSV/TSV of map metadata; run_img.py provides a directory of images instead
50 | sample_map_path = args.sample_map_csv_path
51 | module_geocoord_geojson = args.module_geocoord_geojson
52 | module_post_ocr_entity_linking = args.module_post_ocr_entity_linking
53 | module_post_ocr_only = args.module_post_ocr_only
54 | module_post_ocr = args.module_post_ocr
55 |
56 | elif hasattr(args, "input_dir_path"):
57 | input_dir_path = args.input_dir_path
58 |
59 | expt_name = args.expt_name
60 | output_folder = args.output_folder
61 |
62 | module_get_dimension = args.module_get_dimension
63 | module_gen_geotiff = args.module_gen_geotiff
64 | module_cropping = args.module_cropping
65 | module_text_spotting = args.module_text_spotting
66 | module_img_geojson = args.module_img_geojson
67 |
68 | spotter_model = args.spotter_model
69 | spotter_config = args.spotter_config
70 | spotter_expt_name = args.spotter_expt_name
71 | gpu_id = args.gpu_id
72 |
73 | if_print_command = args.print_command
74 | error_reason_dict = dict()
75 |
76 | if "sample_map_path" in locals():
77 | # ------------------------- Read sample map list and prepare output dir ----------------
78 | if sample_map_path is not None:
79 | input_csv_path = sample_map_path
80 | if input_csv_path[-4:] == '.csv':
81 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str})
82 | elif input_csv_path[-4:] == '.tsv':
83 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str}, sep='\t')
84 | else:
85 | raise NotImplementedError
86 |
87 | external_id_to_img_path_dict, unmatched_external_id_list = get_img_path_from_external_id_and_image_no( sample_map_path = input_csv_path)
88 |
89 | # initialize error reason dict
90 |
91 | for ex_id in unmatched_external_id_list:
92 | error_reason_dict[ex_id] = {'img_path':None, 'error':'Can not find image given external_id.'}
93 |
94 | elif "input_dir_path" in locals():
95 | if input_dir_path is not None:
96 | input_img_path = input_dir_path
97 |             # DataFrame.append was removed in pandas 2.0; build the frame in one step instead
98 |             img_paths = [os.path.join(input_img_path, img) for img in os.listdir(input_img_path)]
99 |             sample_map_df = pd.DataFrame({"external_id": img_paths})
100 |
101 | else:
102 | raise NotImplementedError
103 | else:
104 | raise NotImplementedError
105 |
106 |
107 | expt_out_dir = os.path.join(output_folder, expt_name)
108 | geotiff_output_dir = os.path.join(output_folder, expt_name, 'geotiff')
109 | cropping_output_dir = os.path.join(output_folder, expt_name, 'crop/')
110 | spotting_output_dir = os.path.join(output_folder, expt_name, 'spotter/' + spotter_expt_name)
111 | stitch_output_dir = os.path.join(output_folder, expt_name, 'stitch/' + spotter_expt_name)
112 | geocoord_output_dir = os.path.join(output_folder, expt_name, 'geocoord/' + spotter_expt_name)
113 | postocr_linking_output_dir = os.path.join(output_folder, expt_name, 'postocr_linking/'+ spotter_expt_name)
114 | postocr_only_output_dir = os.path.join(output_folder, expt_name, 'postocr_only/'+ spotter_expt_name)
115 |
116 |
117 | if not os.path.isdir(expt_out_dir):
118 | os.makedirs(expt_out_dir)
119 |
120 | # ------------------------ Get image dimension ------------------------------
121 | if module_get_dimension:
122 | for index, record in sample_map_df.iterrows():
123 | external_id = record.external_id
124 | # pdb.set_trace()
125 | if "sample_map_path" in locals():
126 | if external_id not in external_id_to_img_path_dict:
127 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'}
128 | continue
129 |
130 | img_path = external_id_to_img_path_dict[external_id]
131 |
132 | try:
133 | width, height = get_img_dimension(img_path)
134 | except Exception as e:
135 | error_reason_dict[external_id] = {'img_path':img_path, 'error': e }
136 |
137 | elif "input_dir_path" in locals():
138 | img_path = sample_map_df['external_id'].iloc[index]
139 | width, height = get_img_dimension(img_path)
140 |
141 | map_name = os.path.basename(img_path).split('.')[0]
142 |
143 | # ------------------------- Generate geotiff ------------------------------
144 | time_start = time.time()
145 | if module_gen_geotiff:
146 | os.chdir(os.path.join(map_kurator_system_dir ,'m1_geotiff'))
147 |
148 | if not os.path.isdir(geotiff_output_dir):
149 | os.makedirs(geotiff_output_dir)
150 |
151 | # use converted jpg folder instead of original sid folder
152 | if "sample_map_path" in locals():
153 | merged_input_path=sample_map_path
154 | else:
155 | merged_input_path=input_dir_path
156 |
157 | run_geotiff_command = 'python convert_image_to_geotiff.py --sid_root_dir /data2/rumsey_sid_to_jpg/ --sample_map_path '+ merged_input_path +' --out_geotiff_dir '+geotiff_output_dir # can change params in argparse
158 | exe_ret = execute_command(run_geotiff_command, if_print_command)
159 |         if 'error' in exe_ret:
160 |             logging.error('geotiff generation failed: %s', exe_ret['error'])
161 |
162 |
163 |
164 | # ------------------------- Image cropping ------------------------------
165 | if module_cropping:
166 | for index, record in sample_map_df.iterrows():
167 | external_id = record.external_id
168 | if "sample_map_path" in locals():
169 | if external_id not in external_id_to_img_path_dict:
170 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'}
171 | continue
172 | img_path = external_id_to_img_path_dict[external_id]
173 | else:
174 | img_path = sample_map_df['external_id'].iloc[index]
175 |
176 | map_name = os.path.basename(img_path).split('.')[0]
177 |
178 | os.chdir(os.path.join(map_kurator_system_dir ,'m2_detection_recognition'))
179 | if not os.path.isdir(cropping_output_dir):
180 | os.makedirs(cropping_output_dir)
181 |
182 | run_crop_command = 'python crop_img.py --img_path '+img_path + ' --output_dir '+ cropping_output_dir
183 |
184 | exe_ret = execute_command(run_crop_command, if_print_command)
185 |
186 | if "sample_map_path" in locals():
187 | if 'error' in exe_ret:
188 | error = exe_ret['error']
189 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error }
190 |
191 |
192 |
193 | time_cropping = time.time()
194 |
195 | # ------------------------- Text Spotting (patch level) ------------------------------
196 | if module_text_spotting:
197 | assert os.path.exists(spotter_config), "Config file for spotter must exist!"
198 | os.chdir(text_spotting_model_dir)
199 | os.system("python setup.py build develop 1> /dev/null")
200 |
201 | for index, record in sample_map_df.iterrows():
202 |
203 | external_id = record.external_id
204 | if "sample_map_path" in locals():
205 | if external_id not in external_id_to_img_path_dict:
206 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'}
207 | continue
208 | img_path = external_id_to_img_path_dict[external_id]
209 | else:
210 | img_path = sample_map_df['external_id'].iloc[index]
211 |
212 | map_name = os.path.basename(img_path).split('.')[0]
213 | # print(map_name)
214 |
215 | map_spotting_output_dir = os.path.join(spotting_output_dir, map_name)
216 |
217 | if not os.path.isdir(map_spotting_output_dir):
218 | os.makedirs(map_spotting_output_dir)
219 | else:
220 | num_existing_json = len(glob.glob(os.path.join(map_spotting_output_dir, '*.json')))
221 | num_existing_images = len(glob.glob(os.path.join(cropping_output_dir, map_name, '*jpg')))
222 | if num_existing_json == num_existing_images:
223 | continue
224 | else:
225 | print(f'{index}/{len(sample_map_df)}: Re-run spotting for map {map_name}')
226 | import shutil
227 | shutil.rmtree(map_spotting_output_dir)
228 | os.makedirs(map_spotting_output_dir)
229 |
230 | if spotter_model in ['testr', 'spotter-v2', 'palette']:
231 | run_spotting_command = f'CUDA_VISIBLE_DEVICES={gpu_id} python tools/inference.py --config-file {spotter_config} --output_json --input {os.path.join(cropping_output_dir,map_name)} --output {map_spotting_output_dir}'
232 | else:
233 | raise NotImplementedError
234 |
235 | # print(run_spotting_command)
236 | run_spotting_command += ' 1> /dev/null'
237 |
238 | exe_ret = execute_command(run_spotting_command, if_print_command)
239 | if "sample_map_path" in locals():
240 | if 'error' in exe_ret:
241 | error = exe_ret['error']
242 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error }
243 |
244 | # elif 'time_usage' in exe_ret:
245 | # time_usage = exe_ret['time_usage']
246 | # time_usage_dict[external_id]['spotting'] = time_usage
247 | # else:
248 | # raise NotImplementedError
249 |
250 | logging.info(f'{index}/{len(sample_map_df)}: Done text spotting for {map_name}')
251 |
252 | # time_text_spotting = time.time()
253 |
254 |
255 | # ------------------------- Image coord geojson (map level) ------------------------------
256 | if module_img_geojson:
257 | os.chdir(os.path.join(map_kurator_system_dir ,'m3_image_geojson'))
258 |
259 | if not os.path.isdir(stitch_output_dir):
260 | os.makedirs(stitch_output_dir)
261 |
262 | for index, record in sample_map_df.iterrows():
263 | external_id = record.external_id
264 | if "sample_map_path" in locals():
265 | if external_id not in external_id_to_img_path_dict:
266 | error_reason_dict[external_id] = {'img_path':None, 'error':'key not in external_id_to_img_path_dict'}
267 | continue
268 | img_path = external_id_to_img_path_dict[external_id]
269 | else:
270 | img_path = sample_map_df['external_id'].iloc[index]
271 | map_name = os.path.basename(img_path).split('.')[0]
272 |
273 | stitch_input_dir = os.path.join(spotting_output_dir, map_name)
274 | output_geojson = os.path.join(stitch_output_dir, map_name + '.geojson')
275 |
276 | run_stitch_command = 'python stitch_output.py --input_dir '+stitch_input_dir + ' --output_geojson ' + output_geojson
277 |
278 | exe_ret = execute_command(run_stitch_command, if_print_command)
279 |
280 | if "sample_map_path" in locals():
281 | if 'error' in exe_ret:
282 | error = exe_ret['error']
283 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error }
284 |
285 | # elif 'time_usage' in exe_ret:
286 | # time_usage = exe_ret['time_usage']
287 | # time_usage_dict[external_id]['stitch'] = time_usage
288 | # else:
289 | # raise NotImplementedError
290 |
291 | # time_img_geojson = time.time()
292 |
293 |
294 | # ------------------------- post-OCR ------------------------------
295 | if "sample_map_path" in locals():
296 | if module_post_ocr:
297 | os.chdir(os.path.join(map_kurator_system_dir, 'm4_post_ocr'))
298 |
299 | if not os.path.isdir(postocr_only_output_dir):
300 | os.makedirs(postocr_only_output_dir)
301 |
302 | for index, record in sample_map_df.iterrows():
303 |
304 | external_id = record.external_id
305 | if external_id not in external_id_to_img_path_dict:
306 | error_reason_dict[external_id] = {'img_path': None, 'error': 'key not in external_id_to_img_path_dict'}
307 | continue
308 |
309 | img_path = external_id_to_img_path_dict[external_id]
310 | map_name = os.path.basename(img_path).split('.')[0]
311 |
312 | input_geojson_file = os.path.join(geocoord_output_dir, map_name + '.geojson')
313 |
314 | run_postocr_command = 'python post_ocr_main.py --in_geojson_file '+ input_geojson_file + ' --out_geojson_dir ' + os.path.join(map_kurator_system_dir, postocr_only_output_dir)
315 |
316 | exe_ret = execute_command(run_postocr_command, if_print_command)
317 |
318 | if 'error' in exe_ret:
319 | error = exe_ret['error']
320 | error_reason_dict[external_id] = {'img_path':img_path, 'error': error }
321 |
322 | # elif 'time_usage' in exe_ret:
323 | # time_usage = exe_ret['time_usage']
324 | # time_usage_dict[external_id]['geocoord_geojson'] = time_usage
325 | # else:
326 | # raise NotImplementedError
327 |
328 | # time_geocoord_geojson = time.time()
329 |
330 | # ------------------------- Convert image coordinates to geocoordinates ------------------------------
331 | if "sample_map_path" in locals():
332 | if module_geocoord_geojson:
333 | os.chdir(os.path.join(map_kurator_system_dir, 'm5_geocoordinate_converter'))
334 |
335 | if not os.path.isdir(geocoord_output_dir):
336 | os.makedirs(geocoord_output_dir)
337 |
338 | for index, record in sample_map_df.iterrows():
339 | external_id = record.external_id
340 | if external_id not in external_id_to_img_path_dict:
341 | error_reason_dict[external_id] = {'img_path': None,
342 | 'error': 'key not in external_id_to_img_path_dict'}
343 | continue
344 |
345 | img_path = external_id_to_img_path_dict[external_id]
346 | map_name = os.path.basename(img_path).split('.')[0]
347 |
348 | # current_files_list = glob.glob(os.path.join(map_kurator_system_dir, geocoord_output_dir, "*.geojson"))
349 |
350 | # saved_map_list = []
351 | # for mapname in current_files_list:
352 | # only_map = mapname.split("/")[-1]#.strip().replace(".geojson", "")
353 | # saved_map_list.append(only_map)
354 |
355 | in_geojson = os.path.join(stitch_output_dir, map_name + '.geojson')
356 |
357 | # current_map = in_geojson.split("/")[-1]
358 |
359 | # if current_map not in saved_map_list:
360 | # print("running missing file",current_map)
361 |
362 | run_converter_command = 'python convert_geojson_to_geocoord.py --sample_map_path ' + os.path.join(map_kurator_system_dir, input_csv_path) + ' --in_geojson_file ' + in_geojson + ' --out_geojson_dir ' + os.path.join(map_kurator_system_dir, geocoord_output_dir)
363 |
364 | exe_ret = execute_command(run_converter_command, if_print_command)
365 |
366 | if 'error' in exe_ret:
367 | error = exe_ret['error']
368 | error_reason_dict[external_id] = {'img_path': img_path, 'error': error}
369 |
370 | # elif 'time_usage' in exe_ret:
371 | # time_usage = exe_ret['time_usage']
372 | # time_usage_dict[external_id]['geocoord_geojson'] = time_usage
373 | # else:
374 | # raise NotImplementedError
375 |
376 | # time_geocoord_geojson = time.time()
377 |
378 |
379 | # --------------------- Error logging --------------------------
380 | print('\n')
381 | current_time = datetime.datetime.now().strftime("%Y_%m_%d-%I:%M:%S_%p")
382 | error_reason_df = pd.DataFrame.from_dict(error_reason_dict, orient='index')
383 | error_reason_log_path = os.path.join(output_folder, expt_name, 'error_reason_' + current_time +'.csv')
384 | error_reason_df.to_csv(error_reason_log_path, index_label='external_id')
385 |
386 |
387 | def func_file_to_fullpath_dict(file_path_list):
388 |
389 | file_fullpath_dict = dict()
390 | for file_path in file_path_list:
391 | file_fullpath_dict[os.path.basename(file_path).split('.')[0]] = file_path
392 |
393 | return file_fullpath_dict
394 |
395 | def get_img_path_from_external_id(jp2_root_dir = '/data/rumsey-jp2/', sid_root_dir = '/data2/rumsey_sid_to_jpg/', additional_root_dir='/data2/rumsey-luna-img/', sample_map_path = None,external_id_key = 'external_id') :
396 | # returns (1) a dict with external-id as key, full image path as value (2) list of external-id that can not find image path
397 |
398 | jp2_file_path_list = glob.glob(os.path.join(jp2_root_dir, '*/*.jp2'))
399 | sid_file_path_list = glob.glob(os.path.join(sid_root_dir, '*.jpg'))
400 | add_file_path_list = glob.glob(os.path.join(additional_root_dir, '*'))
401 |
402 | jp2_file_fullpath_dict = func_file_to_fullpath_dict(jp2_file_path_list)
403 | sid_file_fullpath_dict = func_file_to_fullpath_dict(sid_file_path_list)
404 | add_file_fullpath_dict = func_file_to_fullpath_dict(add_file_path_list)
405 |
406 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str})
407 |
408 | external_id_to_img_path_dict = {}
409 |
410 | unmatched_external_id_list = []
411 |
412 | for index, record in sample_map_df.iterrows():
413 | external_id = record.external_id
414 | filename_without_extension = external_id.strip("'").replace('.','')
415 |
416 | full_path = ''
417 | if filename_without_extension in jp2_file_fullpath_dict:
418 | full_path = jp2_file_fullpath_dict[filename_without_extension]
419 | elif filename_without_extension in sid_file_fullpath_dict:
420 | full_path = sid_file_fullpath_dict[filename_without_extension]
421 | elif filename_without_extension in add_file_fullpath_dict:
422 | full_path = add_file_fullpath_dict[filename_without_extension]
423 | else:
424 | # print('image with external_id not found in image_dir:', external_id)
425 | unmatched_external_id_list.append(external_id)
426 | continue
427 | assert (len(full_path)!=0)
428 |
429 | external_id_to_img_path_dict[external_id] = full_path
430 |
431 | return external_id_to_img_path_dict, unmatched_external_id_list
432 |
433 | def get_img_path_from_external_id_and_image_no(jp2_root_dir = '/data/rumsey-jp2/', sid_root_dir = '/data2/rumsey_sid_to_jpg/', additional_root_dir='/data2/rumsey-luna-img/', sample_map_path = None,external_id_key = 'external_id') :
434 | # returns (1) a dict with external-id as key, full image path as value (2) list of external-id that can not find image path
435 |
436 | jp2_file_path_list = glob.glob(os.path.join(jp2_root_dir, '*/*.jp2'))
437 | sid_file_path_list = glob.glob(os.path.join(sid_root_dir, '*.jpg')) # use converted jpg directly
438 | add_file_path_list = glob.glob(os.path.join(additional_root_dir, '*'))
439 |
440 | jp2_file_fullpath_dict = func_file_to_fullpath_dict(jp2_file_path_list)
441 | sid_file_fullpath_dict = func_file_to_fullpath_dict(sid_file_path_list)
442 | add_file_fullpath_dict = func_file_to_fullpath_dict(add_file_path_list)
443 |
444 | sample_map_df = pd.read_csv(sample_map_path, dtype={'external_id':str})
445 |
446 | external_id_to_img_path_dict = {}
447 |
448 | unmatched_external_id_list = []
449 | for index, record in sample_map_df.iterrows():
450 | external_id = record.external_id
451 | image_no = record.image_no
452 | # filename_without_extension = external_id.strip("'").replace('.','')
453 | filename_without_extension = image_no.strip("'").split('.')[0]
454 |
455 | full_path = ''
456 | if filename_without_extension in jp2_file_fullpath_dict:
457 | full_path = jp2_file_fullpath_dict[filename_without_extension]
458 | elif filename_without_extension in sid_file_fullpath_dict:
459 | full_path = sid_file_fullpath_dict[filename_without_extension]
460 | elif filename_without_extension in add_file_fullpath_dict:
461 | full_path = add_file_fullpath_dict[filename_without_extension]
462 | else:
463 | print('image with external_id not found in image_dir:', external_id)
464 | unmatched_external_id_list.append(external_id)
465 | continue
466 | assert (len(full_path)!=0)
467 |
468 | external_id_to_img_path_dict[external_id] = full_path
469 |
470 | return external_id_to_img_path_dict, unmatched_external_id_list
471 |
472 |
473 | if __name__ == '__main__':
474 |
475 | parser = argparse.ArgumentParser()
476 | parser.add_argument('--jp2_root_dir', type=str, default='/data/rumsey-jp2/',
477 | help='image dir of jp2 files.')
478 | parser.add_argument('--sid_root_dir', type=str, default='/data2/rumsey_sid_to_jpg/',
479 | help='image dir of sid files.')
480 | parser.add_argument('--additional_root_dir', type=str, default='/data2/rumsey-luna-img/',
481 | help='image dir of additional luna files.')
482 | parser.add_argument('--sample_map_path', type=str, default='data/initial_US_100_maps.csv',
483 | help='path to sample map csv, which contains gcps info')
484 | parser.add_argument('--external_id_key', type=str, default='external_id',
485 | help='key string for external id, could be external_id or ListNo')
486 |
487 | args = parser.parse_args()
488 | print(args)
489 |
490 | # get_img_path_from_external_id(jp2_root_dir = args.jp2_root_dir, sid_root_dir = args.sid_root_dir, additional_root_dir = args.additional_root_dir,
491 | # sample_map_path = args.sample_map_path,external_id_key = args.external_id_key)
492 |
493 | get_img_path_from_external_id_and_image_no(jp2_root_dir = args.jp2_root_dir, sid_root_dir = args.sid_root_dir, additional_root_dir = args.additional_root_dir,
494 | sample_map_path = args.sample_map_path,external_id_key = args.external_id_key)
495 |
--------------------------------------------------------------------------------
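run_pipeline above distinguishes its two callers (run.py vs run_img.py) by probing locals(); a sketch of the same dispatch with getattr defaults, offered as a refactoring idea rather than code from the repo:

# hypothetical rewrite of the dispatch at the top of run_pipeline
sample_map_path = getattr(args, 'sample_map_csv_path', None)  # set by run.py
input_dir_path = getattr(args, 'input_dir_path', None)        # set by run_img.py

if sample_map_path is not None:
    pass  # CSV/TSV mode: resolve external_id -> image path via the lookup dicts above
elif input_dir_path is not None:
    pass  # folder mode: every file in the directory is an input image
else:
    raise NotImplementedError('expected either sample_map_csv_path or input_dir_path')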