├── samples
├── sample_label.txt
├── sample_crop.jpg
├── sample_pano.jpg
├── getFullLabelList.sql
├── metadata-seattle.csv
├── labeldata.csv
└── sample_meta.xml
├── decode_depthmap
├── requirements.txt
├── .gitignore
├── Dockerfile
├── flag_panos
├── README.md
├── json_to_csv.py
├── index.html
└── index.js
├── DownloadRunnerDockerEntrypoint.sh
├── config.py
├── README.md
├── CropRunner.py
└── DownloadRunner.py
/samples/sample_label.txt:
--------------------------------------------------------------------------------
1 | pano_x 5110
2 | pano_y -688
3 |
4 | 7442
5 | -708
6 |
--------------------------------------------------------------------------------
/decode_depthmap:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectSidewalk/sidewalk-panorama-tools/HEAD/decode_depthmap
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas>=1.2.3
2 | Pillow>=8.1.2
3 | requests==2.25.1
4 | aiohttp>=3.7.4
5 | backoff>=1.10.0
--------------------------------------------------------------------------------
/samples/sample_crop.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectSidewalk/sidewalk-panorama-tools/HEAD/samples/sample_crop.jpg
--------------------------------------------------------------------------------
/samples/sample_pano.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectSidewalk/sidewalk-panorama-tools/HEAD/samples/sample_pano.jpg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | *.pyc
3 | *.swp
4 | id_rsa
5 | venv
6 | *.log
7 | crop.log.bak.txt
8 | .idea
9 | cropsbak
10 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Image for running DownloadRunner.py through DownloadRunnerDockerEntrypoint.sh.
FROM ubuntu:20.04

# Build-time only: keeps apt from prompting for input during package installation.
ARG DEBIAN_FRONTEND=noninteractive

WORKDIR /app

# Update the package index and install in a single layer so that a cached, stale
# index is never reused by a later install; remove the index afterwards to keep
# the layer small.
RUN apt-get update \
    && apt-get install -y sshfs python3 python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Install the Python dependencies before copying the rest of the sources so this
# layer is rebuilt only when requirements.txt changes.
COPY requirements.txt /app/requirements.txt
RUN pip3 install -r requirements.txt

COPY . /app

ENTRYPOINT ["./DownloadRunnerDockerEntrypoint.sh"]
CMD []
10 |
--------------------------------------------------------------------------------
/samples/getFullLabelList.sql:
--------------------------------------------------------------------------------
-- One row per label point, combined with its label and the panorama the label was placed on.
SELECT
    gsv_data.gsv_panorama_id,
    pano_x,
    pano_y,
    label_type_id,
    camera_heading,
    heading,
    pitch,
    label.label_id
FROM label_point
INNER JOIN label
    ON label.label_id = label_point.label_id
INNER JOIN gsv_data
    ON label.gsv_panorama_id = gsv_data.gsv_panorama_id;
6 |
--------------------------------------------------------------------------------
/flag_panos/README.md:
--------------------------------------------------------------------------------
1 | # Flag Panos
2 |
3 | This is a small web tool that Michael Duan (@michaelduan8) created to check the GSV API to see if it has metadata for panos that we don't currently have in our database. It was a one-time-use tool that was created to deal with the period right after Google shut down their depth data endpoint. Our code required access to it, so there was a lapse period where we did not have data. We are keeping this tool around in case it is ever useful again, but it won't be useful to others.
4 |
--------------------------------------------------------------------------------
/flag_panos/json_to_csv.py:
--------------------------------------------------------------------------------
"""Convert the per-city JSON files listed in JSON_TO_CSV_MAP into CSV files."""
import pandas as pd

CITY = 'amsterdam'

# Maps each input JSON file name to the CSV file name it is written to.
JSON_TO_CSV_MAP = {
    f'{CITY}_pano_image_data.json': f'{CITY}_pano_image_data.csv',
    f'{CITY}_unretrievable_panos.json': f'{CITY}_unretrievable_panos.csv',
}

for json_path, csv_path in JSON_TO_CSV_MAP.items():
    # Read the whole JSON document into a DataFrame, then close the file before writing.
    with open(json_path, encoding='utf-8') as json_handle:
        frame = pd.read_json(json_handle)

    frame.to_csv(csv_path, encoding='utf-8', index=False)
17 |
--------------------------------------------------------------------------------
/samples/metadata-seattle.csv:
--------------------------------------------------------------------------------
1 | gsv_panorama_id,pano_x,pano_y,zoom,label_type_id,camera_heading,heading,pitch,label_id,width,height,tile_width,tile_height,image_date,imagery_type,pano_lat,pano_lng,label_lat,label_lng,computation_method,copyright
2 | _qVKgG3dGOoClMQI6QgVRg,10536,5049,3,1,143.214004516602,193.879470825195,-21.1875,120954,16384,8192,512,512,2019-06,1,47.6119232177734,-122.326042175293,47.6118603446453,-122.32606911454,approximation2,
3 | 7nNj0FtXp4bXcUGARDOcCg,6383,4553,1,1,181.617828369141,118,-22.0625,47125,16384,8192,512,512,2018-10,1,47.6568489074707,-122.313278198242,47.6567573547363,-122.313179016113,depth,
4 |
--------------------------------------------------------------------------------
/samples/labeldata.csv:
--------------------------------------------------------------------------------
1 | gsv_panorama_id,pano_x,pano_y,label_type_id,camera_heading,heading,pitch,label_id
2 | 7nNj0FtXp4bXcUGARDOcCg,9785,4677,1,181.617828369141,202.9375,-17.5625,47126
3 | 7nNj0FtXp4bXcUGARDOcCg,10679,4747,1,181.617828369141,231.589279174805,-16.390625,74048
4 | 7nNj0FtXp4bXcUGARDOcCg,1912,4550,1,181.617828369141,51.2209815979004,-14.09375,74054
5 | _qVKgG3dGOoClMQI6QgVRg,9044,5043,1,143.214004516602,158.910720825195,-21.234375,120955
6 | _qVKgG3dGOoClMQI6QgVRg,15154,4963,1,143.214004516602,306.191955566406,-18.65625,120952
7 | _qVKgG3dGOoClMQI6QgVRg,15996,4827,3,143.214004516602,323.113830566406,-16.078125,120950
8 |
--------------------------------------------------------------------------------
/DownloadRunnerDockerEntrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Docker entrypoint: runs DownloadRunner.py, saving either to a local directory
# or to a remote directory mounted over sshfs.
#
# Usage:
#   ./DownloadRunnerDockerEntrypoint.sh sidewalk_server_fqdn [--all-panos] [--attempt-depth]
#   ./DownloadRunnerDockerEntrypoint.sh sidewalk_server_fqdn user@host:/remote/path port [--all-panos] [--attempt-depth]

download_dest=/tmp/download_dest
identity_file=/app/id_rsa

mkdir -p "$download_dest"

# Optional flags are only recognised at the END of the argument list. Peel them
# off one at a time until the last argument is no longer a known flag.
all_panos=""
attempt_depth=""

while [[ $# -gt 0 ]]; do
    case "${@: -1}" in
        "--all-panos")
            all_panos="--all-panos"
            set -- "${@:1:$(($#-1))}"
            ;;
        "--attempt-depth")
            attempt_depth="--attempt-depth"
            set -- "${@:1:$(($#-1))}"
            ;;
        *)
            # Not an optional parameter, stop processing
            break
            ;;
    esac
done

# Forward only the flags that were actually supplied. An array keeps empty
# values from being passed on as empty arguments.
# NOTE(review): --attempt-depth was previously parsed but dropped; forwarding it
# assumes DownloadRunner.py accepts that flag - confirm against its argument parser.
optional_args=()
[[ -n "$all_panos" ]] && optional_args+=("$all_panos")
[[ -n "$attempt_depth" ]] && optional_args+=("$attempt_depth")

# One positional param: download to the local directory.
# Three positional params: a remote host path and port were supplied, so mount it first.
if [ $# -eq 1 ]; then
    python3 DownloadRunner.py "$1" "$download_dest" "${optional_args[@]}"
    exit $?
elif [ $# -eq 3 ]; then
    echo "Mounting $2 port $3 for $1"
    # The private key is only needed for the sshfs mount; ssh rejects keys with loose permissions.
    chmod 600 "$identity_file"
    if ! sshfs -o IdentityFile="$identity_file",StrictHostKeyChecking=no "$2" "$download_dest" -p "$3"; then
        echo "Failed to mount $2 on $download_dest" >&2
        exit 1
    fi
    python3 DownloadRunner.py "$1" "$download_dest" "${optional_args[@]}"
    status=$?
    # Always unmount after a successful mount, but report the downloader's exit status.
    umount "$download_dest"
    exit $status
else
    echo "Usage:" >&2
    echo "    ./DownloadRunnerDockerEntrypoint sidewalk_server_fqdn" >&2
    echo "    ./DownloadRunnerDockerEntrypoint sidewalk_server_fqdn user@host:/remote/path port" >&2
    exit 1
fi
41 |
--------------------------------------------------------------------------------
/flag_panos/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Street View split-map-panes
5 |
6 |
7 |
8 |
9 |
32 |
33 |
34 |
48 |
49 |
50 |
51 |
55 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/flag_panos/index.js:
--------------------------------------------------------------------------------
// City whose Project Sidewalk deployment is queried; also used to name the output files.
const CITY = 'amsterdam';
const SIDEWALK_SERVER_FQDN = 'https://sidewalk-' + CITY + '.cs.washington.edu';

// File names offered for download once all panoramas have been checked.
const OUTPUT_JSON = CITY + '_pano_image_data.json';
const UNRETRIEVABLE_PANOS_JSON = CITY + '_unretrievable_panos.json';

// Number of panoramas whose metadata is requested per batch.
const CHUNK_SIZE = 10000;
8 |
/**
 * Fetches JSON from `url` and passes the parsed result to `callback`.
 *
 * A non-2xx HTTP response, a network failure, invalid JSON, or an error raised
 * by `callback` is logged to the console instead of surfacing as an unhandled
 * promise rejection.
 *
 * @param {string} url - Endpoint to request (a Project Sidewalk panorama endpoint).
 * @param {function} callback - Called with the parsed JSON body.
 * @returns {Promise} Settles once the request and the callback have finished.
 */
function getPanos(url, callback) {
    // grab panorama info from Project Sidewalk endpoint
    return fetch(url)
        .then(response => {
            // fetch() only rejects on network errors, so HTTP error statuses must be checked explicitly.
            if (!response.ok) {
                throw new Error(`Request to ${url} failed with HTTP status ${response.status}`);
            }
            return response.json();
        })
        .then(result => callback(result))
        .catch(error => console.error(`Could not process panorama data from ${url}:`, error));
}
15 |
/**
 * Serializes `data` as JSON and appends a download link for it to the element
 * with id 'json-download'.
 *
 * @param {Array} data - Records to serialize.
 * @param {string} filename - Suggested file name, also shown in the link text.
 */
function append_json_download_link(data, filename) {
    // use Blob in order to create download URL for the JSON file
    const blob = new Blob([JSON.stringify(data)], {type: "application/json"});

    // visualize link on webpage
    const link = document.createElement('a');
    link.href = URL.createObjectURL(blob);
    link.download = filename;
    link.textContent = `Download ${filename}`;

    document.getElementById('json-download').appendChild(link);
}

/**
 * Queries Street View for the image metadata of every panorama in `pano_data`
 * that has an id but no width or height, then offers two JSON downloads: the
 * metadata that was retrieved and the ids for which no metadata was returned.
 *
 * Panoramas are processed in batches of CHUNK_SIZE with a one-minute pause
 * between batches. The download links are added once all batches are done,
 * including when no panorama needed checking (both files are then empty lists).
 *
 * @param {Array<Object>} pano_data - Panorama records; `gsv_panorama_id`, `width` and `height` are read.
 */
async function flag_panos_for_redownload(pano_data) {
    // initially, filter out panos that already have image data or have empty pano_id
    const filtered_pano_data = pano_data.filter(pano => pano["gsv_panorama_id"] && (!pano["width"] || !pano["height"]));
    console.log(filtered_pano_data.length);

    // instantiate streetviewservice instance
    const streetViewService = new google.maps.StreetViewService();

    const new_pano_data = [];
    const failed_to_retrieve_metadata = [];

    // Check pano metadata in chunks
    for (let i = 0; i < filtered_pano_data.length; i += CHUNK_SIZE) {
        const pano_slice = filtered_pano_data.slice(i, i + CHUNK_SIZE);

        // The callback records the outcome for each pano; the values returned by
        // getPanorama are collected only so the whole chunk can be awaited below.
        const metadata_promises = pano_slice.map(pano =>
            streetViewService.getPanorama({pano: pano["gsv_panorama_id"]}, function(svPanoData, status) {
                if (status === google.maps.StreetViewStatus.OK) {
                    const tiles = svPanoData.tiles;
                    new_pano_data.push({
                        gsv_panorama_id: pano["gsv_panorama_id"],
                        image_width: tiles.worldSize.width,
                        image_height: tiles.worldSize.height,
                        tile_width: tiles.tileSize.width,
                        tile_height: tiles.tileSize.height,
                        copyright: svPanoData.copyright,
                        center_heading: tiles.centerHeading,
                        origin_heading: tiles.originHeading,
                        origin_pitch: tiles.originPitch
                    });
                } else {
                    // no street view data available for this panorama.
                    failed_to_retrieve_metadata.push({gsv_panorama_id: pano["gsv_panorama_id"]});
                }
            })
        );

        // wait for all metadata promises to settle
        const results = await Promise.allSettled(metadata_promises);

        // see how many failed in chunk
        console.log(results.filter(result => result.status === "rejected").length);

        // check updated new_pano_data length
        console.log(new_pano_data.length);

        // check if this chunk was the last chunk
        const last_chunk = i + CHUNK_SIZE >= filtered_pano_data.length;

        if (!last_chunk) {
            // sleep for a minute to not exceed QPM rate-limit on Google's end.
            console.log("Sleeping for 1 min to not exceed QPM limit")
            await new Promise(r => setTimeout(r, 60000));
            console.log("Done Sleeping")
        }
    }

    // All chunks are done: publish both result lists as downloadable JSON files.
    append_json_download_link(new_pano_data, OUTPUT_JSON);
    append_json_download_link(failed_to_retrieve_metadata, UNRETRIEVABLE_PANOS_JSON);
}
110 |
/**
 * Loads the panorama list from the Project Sidewalk admin API and passes it to
 * flag_panos_for_redownload, which filters for panos without image size data
 * and queries their metadata.
 */
function initialize() {
    const panos_url = `${SIDEWALK_SERVER_FQDN}/adminapi/panos`;
    getPanos(panos_url, function (data) {
        return flag_panos_for_redownload(data);
    });
}
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | # Number of parallel workers used by the downloader. The work is I/O-bound, so more is usually faster - test to find the limit for your machine and network.
2 | thread_count = 8
3 |
4 | # Proxy settings - if proxy not added, leave as is
5 | proxies = {
6 | "http": "http://",
7 | "https": "http://",
8 | }
9 |
10 | # -------------------------------
11 | # Windows Headers
12 | # -------------------------------
13 |
14 | # Edge
15 | headers_list = [
16 | {
17 | 'Connection': 'keep-alive',
18 | 'Upgrade-Insecure-Requests': '1',
19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81',
20 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
21 | 'Accept-Language': 'en-US,en;q=0.9',
22 | 'Referer': 'http://maps.google.com',
23 | },
24 |
25 | # Firefox 85 on Windows 10
26 | {
27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0',
28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
29 | 'Accept-Language': 'en-US,en;q=0.5',
30 | 'Referer': 'http://maps.google.com',
31 | 'DNT': '1',
32 | 'Connection': 'keep-alive',
33 | 'Upgrade-Insecure-Requests': '1',
34 | },
35 |
36 | # Chrome 88 Windows 10
37 | {
38 | 'Connection': 'keep-alive',
39 | 'Upgrade-Insecure-Requests': '1',
40 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
41 | 'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
42 | 'Accept-Language': 'en-GB,en;q=0.9',
43 | 'Referer': 'http://maps.google.com',
44 | },
45 |
46 | # Opera for Windows 10
47 | {
48 | 'Connection': 'keep-alive',
49 | 'Upgrade-Insecure-Requests': '1',
50 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 OPR/74.0.3911.160',
51 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
52 | 'Accept-Language': 'en-US,en;q=0.9',
53 | 'Referer': 'http://maps.google.com',
54 | },
55 |
56 | # -------------------------------
57 | # Mac Headers
58 | # -------------------------------
59 |
60 | # Edge 88 Mac
61 | {
62 | 'Connection': 'keep-alive',
63 | 'Upgrade-Insecure-Requests': '1',
64 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81',
65 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
66 | 'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8',
67 | 'Referer': 'http://maps.google.com',
68 | },
69 |
70 | # Opera 74 Mac
71 | {
72 | 'Connection': 'keep-alive',
73 | 'Upgrade-Insecure-Requests': '1',
74 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 OPR/74.0.3911.160',
75 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
76 | 'Accept-Language': 'en-GB,en;q=0.9',
77 | 'Referer': 'http://maps.google.com',
78 | },
79 |
80 | # Chrome 88 Mac
81 | {
82 | 'Connection': 'keep-alive',
83 | 'Upgrade-Insecure-Requests': '1',
84 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36',
85 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
86 | 'Referer': 'http://maps.google.com',
87 | 'Accept-Language': 'en-GB,en;q=0.9',
88 | },
89 |
90 | # Firefox 86 Mac
91 | {
92 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0',
93 | 'Accept': 'image/webp,*/*',
94 | 'Accept-Language': 'en-US,en;q=0.5',
95 | 'Referer': 'http://maps.google.com',
96 | 'DNT': '1',
97 | 'Connection': 'keep-alive',
98 | },
99 |
100 | # Safari 14 Mac
101 | {
102 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
103 | 'Upgrade-Insecure-Requests': '1',
104 | 'Host': 'maps.google.com',
105 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
106 | 'Accept-Language': 'en-ie',
107 | 'Connection': 'keep-alive',
108 | },
109 |
110 | # -------------------------------
111 | # Ubuntu Headers
112 | # -------------------------------
113 |
114 | # Firefox 86 Ubuntu
115 | {
116 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
117 | 'Accept': 'image/webp,*/*',
118 | 'Accept-Language': 'en-GB,en;q=0.5',
119 | 'Referer': 'http://maps.google.com',
120 | 'DNT': '1',
121 | 'Connection': 'keep-alive',
122 | },
123 |
124 | # Chrome 88 Ubuntu
125 | {
126 | 'Connection': 'keep-alive',
127 | 'Upgrade-Insecure-Requests': '1',
128 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36',
129 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
130 | 'Accept-Language': 'en-GB,en;q=0.9',
131 | 'Referer': 'http://maps.google.com',
132 | },
133 |
134 | # Opera 74 Ubuntu
135 | {
136 | 'Connection': 'keep-alive',
137 | 'Upgrade-Insecure-Requests': '1',
138 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 OPR/74.0.3911.160',
139 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
140 | 'Accept-Language': 'en-GB,en;q=0.9',
141 | 'Referer': 'http://maps.google.com',
142 | }
143 | ]
144 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sidewalk-panorama-tools
2 |
3 | ## About
4 | This repository contains a set of Python scripts, intended to be used with data from [Project Sidewalk](https://github.com/ProjectSidewalk/SidewalkWebpage). The purpose of these scripts is to create crops of sidewalk accessibility issues/features, usable for ML and computer vision applications, from Google Street View panoramas via crowd-sourced label data from Project Sidewalk.
5 |
6 | The scripts are intended to be run inside a Docker container running Ubuntu 20.04 64-bit. However, one should be able to run these scripts on most Linux distros without the need for Docker, assuming the Python packages listed in `requirements.txt` can be installed. Additional effort would be required to use the downloader on a Mac or Windows machine without Docker.
7 |
8 | There are two main scripts of note: [DownloadRunner.py](DownloadRunner.py) and [CropRunner.py](CropRunner.py). Both should be fully functional, but only the downloader is actively in use (a new version is in the works), so we may not notice bugs with the cropper as quickly. More details on both below!
9 |
10 | **Note:** At least 2GB RAM is recommended, as these scripts may crash on very low memory systems due to the size of the images processed.
11 |
12 | ## Downloader
13 | 1. [Install Docker Desktop](https://www.docker.com/get-started).
14 | 1. Run `git clone https://github.com/ProjectSidewalk/sidewalk-panorama-tools.git` in the directory where you want to put the code.
15 | 1. Create the Docker image
16 | ```
17 | docker build --no-cache --pull -t projectsidewalk/scraper:v5 .
18 | ```
19 | 1. You can then run the downloader using the following command:
20 | ```
21 | docker run --cap-add SYS_ADMIN --device=/dev/fuse --security-opt apparmor:unconfined projectsidewalk/scraper:v5 <sidewalk_server_fqdn>
22 | ```
23 | Where the `<sidewalk_server_fqdn>` looks like `sidewalk-columbus.cs.washington.edu` if you want data from Columbus. If you visit that URL, you will see a dropdown menu with a list of publicly deployed cities that you can pull data from.
24 | 1. Right now the data is stored in a temporary directory in the Docker container. You could set up a shared volume for it, but for now you can just copy the data over using `docker cp <container_id>:/tmp/download_dest/ <local_storage_location>`, where `<local_storage_location>` is the place on your local machine where you want to save the files. You can find the `<container_id>` using `docker ps -a`.
25 |
26 | Additional settings can be configured for `DownloadRunner.py` in the configuration file `config.py`.
27 | * `thread_count` - the number of threads you wish to run in parallel. As this uses asyncio and is an I/O task, the higher the count the faster the operation, but you will need to test what the upper limit is for your own device and network connection.
28 | * `proxies` - if you wish to use a proxy when downloading, update this dictionary with the relevant details, otherwise leave as is and no proxy will be used.
29 | * `headers_list` - this is a list of real browser headers that is used when making requests. You can add to this list, edit it, or leave as is.
30 |
31 | ## Cropper
32 |
33 | `CropRunner.py` creates crops of the accessibility features from the downloaded GSV panoramas images via label data from Project Sidewalk, provided by their API.
34 |
35 | Usage:
36 | ```python
37 | python CropRunner.py [-h] (-d [D] | -f [F]) [-s S] [-o O]
38 | ```
39 | * To fetch label metadata from a webserver or a file, use respectively (mutually exclusive, required):
40 | * ``-d <sidewalk_server_domain>``
41 | * ``-f <metadata_file>``
42 | * ``-s <pano_storage_directory>`` (optional). Specify if using a different directory containing panoramas. Panoramas are used to crop the labels.
43 | * ``-o <crop_output_directory>`` (optional). Specify if you want to set a different directory for crops to be stored.
44 |
45 | As an example:
46 | ```python
47 | python CropRunner.py -d sidewalk-columbus.cs.washington.edu -s /sidewalk/columbus/panos/ -o /sidewalk/columbus/crops/
48 | ```
49 |
50 | **Note** You will likely want to filter out labels where `disagree_count > agree_count`. These are based on human-provided validations from other Project Sidewalk users. This is not written in the code by default. There is also an option for a filter that is even more strict. This of course has the tradeoff of using less data, so this depends on the needs of your project: more data vs more accurate data. To do this, you would query the `/v2/access/attributesWithLabels` API endpoint for the city you're looking at. Then you would only include labels where the `label_id` is also present in the attributesWithLabels API. This is a more aggressive filter that removes labels from some users that we suspect are providing low quality data based on some heuristics.
51 |
52 | **Note** We have noticed some error in the y-position of labels on the panorama. We believe that this either comes from a bug in the GSV API, or it may be that there is some metadata that Google is not providing us. The errors are relatively small and in the y-direction. As of Apr 2023 we are working on an alternative cropper that attempts to correct for these errors, but it is in development. The version here should work pretty well for now though!
53 |
54 | ## Definitions of variables found in APIs
55 |
56 | ### Downloader: /adminapi/panos
57 | | Attribute | Definition |
58 | | ------------- | ------------- |
59 | | gsv_panorama_id | A unique ID, provided by Google, for the panoramic image |
60 | | width | The width of the pano image in pixels |
61 | | height | The height of the pano image in pixels |
62 | | lat | The latitude of the camera when the image was taken |
63 | | lng | The longitude of the camera when the image was taken |
64 | | camera_heading | The heading (in degrees) of the center of the image with respect to true north |
65 | | camera_pitch | The pitch (in degrees) of the camera with respect to horizontal |
66 |
67 |
68 | ### Cropper: /adminapi/labels/cvMetadata
69 | You won't need most of this data in your work, but it's all here for reference. Everything through `notsure_count` might be useful, then there are a few that are duplicates from the API described above, then everything starting with `canvas_width` probably won't matter for you.
70 |
71 | | Attribute | Definition |
72 | | ------------- | ------------- |
73 | | label_id | A unique ID for each label (within a given city), provided by Project Sidewalk |
74 | | gsv_panorama_id | A unique ID, provided by Google, for the panoramic image [same as /adminapi/panos] |
75 | | label_type_id | An integer ID denoting the type of label placed, defined in the chart below |
76 | | pano_x | The x-pixel location of the label on the pano, where top-left is (0,0) |
77 | | pano_y | The y-pixel location of the label on the pano, where top-left is (0,0) |
78 | | agree_count | The number of "agree" validations provided by Project Sidewalk users |
79 | | disagree_count | The number of "disagree" validations provided by Project Sidewalk users |
80 | | notsure_count | The number of "not sure" validations provided by Project Sidewalk users |
81 | | pano_width | The width of the pano image in pixels [same as /adminapi/panos] |
82 | | pano_height | The height of the pano image in pixels [same as /adminapi/panos] |
83 | | camera_heading | The heading (in degrees) of the center of the image with respect to true north [same as /adminapi/panos] |
84 | | camera_pitch | The pitch (in degrees) of the camera with respect to horizontal [same as /adminapi/panos] |
85 | | canvas_width | The width of the canvas where the user placed a label in Project Sidewalk |
86 | | canvas_height | The height of the canvas where the user placed a label in Project Sidewalk |
87 | | canvas_x | The x-pixel location where the user clicked on the canvas to place the label, where top-left is (0,0) |
88 | | canvas_y | The y-pixel location where the user clicked on the canvas to place the label, where top-left is (0,0) |
89 | | heading | The heading (in degrees) of the center of the canvas with respect to true north when the label was placed |
90 | | pitch | The pitch (in degrees) of the center of the canvas with respect to _the camera's pitch_ when the label was placed |
91 | | zoom | The zoom level in the GSV interface when the user placed the label |
92 |
93 |
94 | Note that the numbers in the `label_type_id` column correspond to these label types (yes, 8 was skipped! :shrug:):
95 |
96 | | label_type_id | label type |
97 | | ------------- | ------------- |
98 | | 1 | Curb Ramp |
99 | | 2 | Missing Curb Ramp |
100 | | 3 | Obstacle in a Path |
101 | | 4 | Surface Problem |
102 | | 5 | Other |
103 | | 6 | Can't see the sidewalk |
104 | | 7 | No Sidewalk |
105 | | 9 | Crosswalk |
106 | | 10 | Pedestrian Signal |
107 |
108 | ## Suggested Improvements
109 |
110 | * `CropRunner.py` - implement multi-core usage when creating crops. It currently runs on a single core; most modern machines
111 | have more than one core, so this would give a speed-up when cropping tens of thousands of images and objects.
112 | * Add logic to the `progress_check()` function so that it can register if there is a network failure and does not log the pano id as visited and failed.
113 | * Project Sidewalk group to delete old or commented code once they decide it is no longer required (all code which used the previously available XML data).
114 |
115 | ## Depth Maps
116 | Depth maps are calculated using downloaded metadata from Google Street View. The endpoint being used to gather the needed XML metadata for depth map calculation isn't a publicly supported API endpoint from Google. It has been only sporadically available throughout 2022, and as of Apr 2023, has been unavailable for the past nine months. We continue to include the code to download the XML and decode the depth data in our download scripts on the off chance that the endpoint comes back online at some point.
117 |
118 | **Note:** Decoding the depth maps on an OS other than Linux will likely require recompiling the `decode_depthmap` binary for your system using [this source](https://github.com/jianxiongxiao/ProfXkit/blob/master/GoogleMapsScraper/decode_depthmap.cpp).
119 |
120 | ## Old Code We've Removed
121 | In PR [#26](https://github.com/ProjectSidewalk/sidewalk-panorama-tools/pull/26), we removed some old code. Some was related to our Tohme paper from 2014, some had to do with using depth maps for cropping images. Given that no one seems to be using the Tohme code (those on our team don't even know how it works) and Google has removed access to their depth data API, we removed this code in Apr 2023. We are hoping that this will simplify the repository, making it easier to make use of our newer work, while making it easier to maintain the code that's actually being used.
122 |
123 | If any of this code ever needs to be revived, it exists in the git history, and can be found in the PR linked above!
124 |
--------------------------------------------------------------------------------
/CropRunner.py:
--------------------------------------------------------------------------------
1 | """
2 | ** Crop Extractor for Project Sidewalk **
3 |
4 | Given label metadata from the Project Sidewalk database, this script will extract JPEG crops of the features that have
5 | been labeled. The required metadata should be obtained through an API endpoint on the Project Sidewalk server for a
6 | given city, passed as an argument to this script. Alternatively, if you have a CSV containing this data (from running
7 | the samples/getFullLabelList.sql script) you can pass in the name of that CSV file as an argument.
8 |
9 | Additionally, you should have downloaded original panorama images from Street View using DownloadRunner.py. You will
10 | need to supply the path to the folder containing these files.
11 |
12 | """
13 |
14 | import sys
15 | import logging
16 | import os
17 | from PIL import Image, ImageDraw
18 | import json
19 | import requests
20 | from requests.adapters import HTTPAdapter
21 | import urllib3
22 | from urllib3.util.retry import Retry
23 | import pandas as pd
24 | import argparse
25 | try:
26 | from xml.etree import cElementTree as ET
27 | except ImportError as e:
28 | from xml.etree import ElementTree as ET
29 |
# Mark the center of the crop?
MARK_LABEL = True

logging.basicConfig(filename='crop.log', level=logging.DEBUG)

# Command-line interface. Exactly one label source is required: -d (server) or -f (file).
parser = argparse.ArgumentParser()
group_parser = parser.add_mutually_exclusive_group(required=True)
group_parser.add_argument(
    '-d',
    nargs='?',
    help='sidewalk_server_domain (preferred over metadata_file) - FDQN of SidewalkWebpage server to fetch label list from, i.e. sidewalk-columbus.cs.washington.edu',
)
group_parser.add_argument(
    '-f',
    nargs='?',
    help='metadata_file - path to file containing label_ids and their properties. It may be CSV or JSON. i.e. samples/labeldata.csv',
)
parser.add_argument(
    '-s',
    default='/tmp/download_dest/',
    help='pano_storage_directory - path to directory containing panoramas downloaded using DownloadRunner.py. default=/tmp/download_dest/',
)
parser.add_argument(
    '-o',
    default='/crops/',
    help='crop_output_directory - path to location for saving the crops. default=/crops/',
)
args = parser.parse_args()

# Label source: server domain (-d) or path to a JSON/CSV metadata file (-f).
sidewalk_server_fdqn, label_metadata_file = args.d, args.f
# Input directory of downloaded panoramas (-s) and output directory for crops (-o).
gsv_pano_path, crop_destination_path = args.s, args.o
51 |
def request_session():
    """
    Builds a requests session whose HTTPS requests are retried automatically on errors.
    Up to 5 retries are made (with backoff) for connection problems and for the 429/500/502/503/504 statuses;
    once retries are exhausted on a bad status the response is returned rather than raising.
    :return: the configured session
    """
    retry_policy = Retry(
        total=5,
        connect=5,
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=1,
        raise_on_status=False,
    )
    http_session = requests.Session()
    # Only https:// URLs get the retrying adapter.
    http_session.mount('https://', HTTPAdapter(max_retries=retry_policy))
    return http_session
62 |
def fetch_label_ids_csv(metadata_csv_path):
    """
    Loads label metadata from a CSV file, keeping only the first row seen for each label_id. Useful for old csv
    formats of cvMetadata such as cv-metadata-seattle.csv
    :param metadata_csv_path: path (including file name) of the metadata csv, eg. sample/metadata-seattle.csv
    :return: A list of dicts, one per unique label_id, keyed by the CSV's column names
    """
    all_rows = pd.read_csv(metadata_csv_path)
    unique_rows = all_rows.drop_duplicates(subset=['label_id'])
    return unique_rows.to_dict('records')
74 |
def json_to_list(jsondata):
    """
    Collects the label records of a parsed JSON payload into a list, dropping any record whose label_id has
    already been seen (the first occurrence wins). The result is what bulk_extract_crops() consumes.
    :param jsondata: iterable of dicts, each of which must have a "label_id" key
    :return: A list of the unique label dicts, in their original order
    """
    seen_ids = set()
    kept = []

    for record in jsondata:
        record_id = record["label_id"]
        if record_id in seen_ids:
            print("Duplicate label ID")
            continue
        seen_ids.add(record_id)
        kept.append(record)

    assert len(seen_ids) == len(kept)
    return kept
95 |
def fetch_cvMetadata_from_file(metadata_json_path):
    """
    Reads a JSON file of label metadata and returns its labels with duplicates removed.
    :param metadata_json_path: path of the json file containing all label ids and their associated data.
    :return: A list of label dicts as produced by json_to_list()
    """
    with open(metadata_json_path) as metadata_handle:
        parsed_labels = json.load(metadata_handle)
        return json_to_list(parsed_labels)
107 |
# https://stackoverflow.com/questions/54356759/python-requests-how-to-determine-response-code-on-maxretries-exception
def fetch_cvMetadata_from_server(server_fdqn):
    """
    Fetches cvMetadata from the server over HTTPS, parses the JSON body and transforms it into a list of dicts.
    Each element is associated with a single label. On any request failure the error is logged and the process
    exits with status 1.
    :param server_fdqn: domain name of the SidewalkWebpage server to query
    :return: list of labels
    """
    url = 'https://' + server_fdqn + '/adminapi/labels/cvMetadata'
    session = request_session()
    try:
        print("Getting metadata from web server")
        response = session.get(url)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        logging.error('HTTPError: {}'.format(e))
        print("Cannot fetch metadata from webserver. Check log file.")
        sys.exit(1)
    except (requests.exceptions.RequestException, urllib3.exceptions.MaxRetryError) as e:
        # requests wraps urllib3's retry/connection failures in its own exception types, so catching only
        # MaxRetryError would let connection errors escape as an unhandled traceback.
        logging.error('Retries: {}'.format(e))
        print("Cannot fetch metadata from webserver. Check log file.")
        sys.exit(1)

    jsondata = response.json()
    return json_to_list(jsondata)
132 |
def predict_crop_size(pano_y, pano_height):
    """
    Predicts the side length of the crop for a label from its vertical position in the pano.

    The steps are:
    1. Convert `pano_y` and `pano_height` to the old version of `pano_y` that was in use when this algorithm was
       written.
    2. Approximate the distance from camera to label with an experimentally determined linear formula (never
       below 0).
    3. Turn that distance into a crop size with an experimentally determined power law, limited to the range
       50..1500. A distance of 0 always yields 1500.

    Context for the current formulae:
    https://github.com/ProjectSidewalk/sidewalk-cv-tools/issues/2#issuecomment-510609873
    https://github.com/ProjectSidewalk/SidewalkWebpage/issues/633#issuecomment-307283178

    Known areas for improvement:
    1. There is an updated distance estimation formula that takes zoom level into account:
       https://github.com/ProjectSidewalk/SidewalkWebpage/blob/develop/public/javascripts/SVLabel/src/SVLabel/label/Label.js#L17
    2. That distance estimation formula should be recreated given some of the bugs fixed in the past few years.
    """
    legacy_pano_y = pano_height / 2 - pano_y
    estimated_distance = max(0, 19.80546390 + 0.01523952 * legacy_pano_y)

    # The power law is undefined at distance 0, so that case goes straight to the largest crop.
    if estimated_distance == 0:
        return 1500

    size = 8725.6 * (estimated_distance ** -1.192)
    if size > 1500:
        return 1500
    if size < 50:
        return 50
    return size
161 |
162 |
def make_single_crop(path_to_image, pano_x, pano_y, output_filename, draw_mark=False):
    """
    Makes a square crop around the object of interest and saves it.
    :param path_to_image: where the GSV pano is stored
    :param pano_x: x-pixel of label on the GSV image
    :param pano_y: y-pixel of label on the GSV image
    :param output_filename: name of file for saving
    :param draw_mark: if a dot should be drawn in the centre of the object/image
    :return: none
    """
    # Open the pano in a context manager so its file handle is released when the crop is done; this function
    # runs once per label, so handles left open would accumulate over a bulk run.
    with Image.open(path_to_image) as pano:
        pano_width, pano_height = pano.size
        print(pano_width, pano_height)

        # The crop is square: the same predicted size is used for both sides.
        predicted_crop_size = predict_crop_size(pano_y, pano_height)
        crop_width = predicted_crop_size
        crop_height = predicted_crop_size

        if draw_mark:
            # The drawing context is only needed when a mark is requested.
            r = 10
            draw = ImageDraw.Draw(pano)
            draw.ellipse((pano_x - r, pano_y - r, pano_x + r, pano_y + r), fill=128)

        print("Plotting at " + str(pano_x) + "," + str(pano_y))

        # Center the crop box on the label position.
        top_left_x = pano_x - crop_width / 2
        top_left_y = pano_y - crop_height / 2
        cropped_square = pano.crop((top_left_x, top_left_y, top_left_x + crop_width, top_left_y + crop_height))
        cropped_square.save(output_filename)

    return
196 |
197 |
def bulk_extract_crops(labels_to_crop, path_to_gsv_scrapes, destination_dir, mark_label=False):
    """
    Extracts one crop per label and prints a summary at the end.
    Panos are looked up at <path_to_gsv_scrapes>/<first two chars of pano id>/<pano id>.jpg and crops are written
    to <destination_dir>/<label_type_id>/<label_id>.jpg. Labels whose pano is missing are counted and logged;
    labels whose crop file already exists are left alone.
    :param labels_to_crop: list of dicts with gsv_panorama_id, pano_x, pano_y, label_type_id and label_id
    :param path_to_gsv_scrapes: directory holding the downloaded panos
    :param destination_dir: directory under which the crops are saved
    :param mark_label: passed to make_single_crop() as draw_mark
    :return: none
    """
    total_labels = len(labels_to_crop)
    no_metadata_fail, no_pano_fail, success = 0, 0, 0

    for label in labels_to_crop:
        pano_id = label['gsv_panorama_id']
        print(pano_id)
        pano_x = float(label['pano_x'])
        pano_y = float(label['pano_y'])
        label_type = int(label['label_type_id'])
        label_id = int(label['label_id'])

        pano_img_path = os.path.join(path_to_gsv_scrapes, pano_id[:2], pano_id + ".jpg")

        print(f'Cropping label {1 + no_pano_fail + no_metadata_fail + success} of {total_labels}')
        print(pano_img_path)

        # Nothing to crop from when the pano was never downloaded.
        if not os.path.exists(pano_img_path):
            no_pano_fail += 1
            print("Panorama image not found.")
            logging.warning("Skipped label id " + str(label_id) + " due to missing image.")
            continue

        destination_folder = os.path.join(destination_dir, str(label_type))
        if not os.path.isdir(destination_folder):
            os.makedirs(destination_folder)

        crop_destination = os.path.join(destination_dir, str(label_type), str(label_id) + ".jpg")
        # A crop that is already on disk is neither redone nor counted.
        if os.path.exists(crop_destination):
            continue

        make_single_crop(pano_img_path, pano_x, pano_y, crop_destination, draw_mark=mark_label)
        print("Successfully extracted crop to " + str(label_id) + ".jpg")
        logging.info(f'{str(label_id)}.jpg {pano_id} {str(pano_x)} {str(pano_y)} {str(label_id)}')
        logging.info("---------------------------------------------------")
        success += 1

    print("Finished.")
    print(f"{no_pano_fail} extractions failed because panorama image was not found.")
    print(f"{no_metadata_fail} extractions failed because metadata was not found.")
    print(f"{success} extractions were successful.")
    return
240 |
241 |
print("Cropping labels")

if label_metadata_file is not None:
    file_path = os.path.splitext(label_metadata_file)
    # Compare the extension case-insensitively so that e.g. labels.CSV is accepted as well.
    file_extension = file_path[-1].lower()
    if file_extension == ".csv":
        label_infos = fetch_label_ids_csv(label_metadata_file)
    elif file_extension == ".json":
        label_infos = fetch_cvMetadata_from_file(label_metadata_file)
    else:
        # Without this branch label_infos would stay undefined and the call below would die with a NameError.
        parser.error(f'unsupported metadata file type "{file_path[-1]}": expected a .csv or .json file')
elif sidewalk_server_fdqn is not None:
    label_infos = fetch_cvMetadata_from_server(sidewalk_server_fdqn)
else:
    # Both options use nargs='?', so "-d" or "-f" given without a value leaves both settings as None.
    parser.error('a value is required for -d (server domain) or -f (metadata file)')

bulk_extract_crops(label_infos, gsv_pano_path, crop_destination_path, mark_label=MARK_LABEL)
254 |
--------------------------------------------------------------------------------
/samples/sample_meta.xml:
--------------------------------------------------------------------------------
1 | © 2016 Google4500 Sheriff Rd NE4500Washington, District of ColumbiaUnited StatesSheriff Rd NESheriff Rd NESheriff Rd 
--------------------------------------------------------------------------------
/DownloadRunner.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/python3
2 |
3 | import os
4 | from os.path import exists
5 | import stat
6 | import http.client
7 | import json
8 | import logging
9 | from datetime import datetime
10 | import time
11 | import requests
12 | from requests.adapters import HTTPAdapter
13 | from requests.packages.urllib3.util.retry import Retry
14 | from PIL import Image
15 | import fnmatch
16 | import pandas as pd
17 | import random
18 | from config import headers_list, proxies, thread_count
19 | import argparse
20 | import asyncio
21 | import aiohttp
22 | from aiohttp import web
23 | import backoff
24 | from io import BytesIO
25 | import math
26 |
27 | try:
28 | from xml.etree import cElementTree as ET
29 | except ImportError as e:
30 | from xml.etree import ElementTree as ET
31 |
32 |
class Enum(object):
    """
    Minimal enumeration: every name in `tuplelist` is readable as an attribute whose value is the position of
    that name in the sequence, e.g. Enum(('a', 'b')).b == 1.
    """

    def __init__(self, tuplelist):
        self.tuplelist = tuplelist

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If `tuplelist` itself is missing (an instance created
        # without __init__, e.g. while being copied), looking it up below would re-enter this method forever.
        if name == 'tuplelist':
            raise AttributeError(name)
        try:
            return self.tuplelist.index(name)
        except ValueError:
            # Unknown names must surface as AttributeError so that hasattr() and getattr(obj, name, default)
            # behave normally.
            raise AttributeError(name) from None
39 |
40 |
# Outcome codes for a single download; each name evaluates to its index in this tuple.
DownloadResult = Enum(('skipped', 'success', 'fallback_success', 'failure'))

delay = 0

# Check proxy settings: if either entry is still the bare scheme placeholder (the default), set both to None
if proxies['http'] == "http://" or proxies['https'] == "https://":
    proxies['http'] = None
    proxies['https'] = None

# Command line interface: two positional arguments (server domain, storage path) plus optional switches.
parser = argparse.ArgumentParser()
parser.add_argument('d', help='sidewalk_server_domain - FDQN of SidewalkWebpage server to fetch pano list from, i.e. sidewalk-columbus.cs.washington.edu')
parser.add_argument('s', help='storage_path - location to store scraped panos')
parser.add_argument('-c', nargs='?', default=None, help='csv_path - location of csv from which to read pano metadata')
parser.add_argument('--all-panos', action='store_true', help='Run on all panos that users visited, even if no labels were added on them.')
parser.add_argument('--attempt-depth', action='store_true', help='Attempt do download depth data (we believe that this endpoint was removed in 2022 and expect depth download to always fail).')
args = parser.parse_args()

sidewalk_server_fqdn = args.d
storage_location = args.s
pano_metadata_csv = args.c
all_panos = args.all_panos
attempt_depth = args.attempt_depth

# Echo the effective settings.
print(sidewalk_server_fqdn)
print(storage_location)
print(pano_metadata_csv)
print(all_panos)
# sidewalk_server_fqdn = "sidewalk-columbus.cs.washington.edu"  # TODO: use as defaults?
# storage_location = "download_data/"  # The path to where you want to store downloaded GSV panos

# Make sure the storage directory exists before anything is written to it.
if not os.path.exists(storage_location):
    os.makedirs(storage_location)

print("Starting run with pano list fetched from %s and destination path %s" % (sidewalk_server_fqdn, storage_location))
75 |
76 |
def new_random_delay():
    """
    Picks a fresh random delay value.
    :return: int drawn from 100, 103, 106, ..., 199 (i.e. from range(100, 200, 3))
    """
    delay_value = random.randrange(100, 200, 3)
    return delay_value
83 |
84 |
def random_header():
    """
    Picks one entry at random from the headers_list imported from the config file; a new pick is made on every
    call.
    :return: the randomly selected headers entry.
    """
    return random.choice(headers_list)
93 |
94 |
# Set up the requests session for better robustness/respect of crawling
# https://stackoverflow.com/questions/23013220/max-retries-exceeded-with-url-in-requests
# Server errors while using proxy - https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/
def request_session():
    """
    Builds a requests session that retries failed requests: up to 5 retries with backoff, also triggered by the
    429/500/502/503/504 statuses. The retrying adapter is installed for both http:// and https:// URLs.
    :return: session
    """
    retry_policy = Retry(total=5, connect=5, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=1)
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    for url_prefix in ('http://', 'https://'):
        http_session.mount(url_prefix, retrying_adapter)
    return http_session
109 |
110 |
def get_response(url, session, stream=False):
    """
    Issues a GET request through the given session, using a randomly chosen header set and the configured
    proxies.
    :param url: url to visit
    :param session: requests session
    :param stream: Default False. When true the request is streamed and the raw body object is returned.
    :return: the response, or response.raw when stream is set
    """
    reply = session.get(url, headers=random_header(), proxies=proxies, stream=stream)
    return reply.raw if stream else reply
125 |
126 |
def progress_check(csv_pano_log_path):
    """
    Summarizes the download log csv so that panos which were already processed can be skipped instead of being
    re-tried. The 'downloaded' column is summed to get the success count (1 = downloaded, 0 = failed).
    NB: This will not check if the failure was due to internet connection being unavailable etc. so use with caution.
    :param csv_pano_log_path: path of the csv log with 'gsv_pano_id' and 'downloaded' columns
    :return: set of processed pano_ids, total count of processed, count of success, count of failure
    """
    # temporary skip/speed up of processed panos
    log_frame = pd.read_csv(csv_pano_log_path)
    processed_total = len(log_frame.index)
    succeeded_total = log_frame['downloaded'].sum()
    return (
        set(log_frame['gsv_pano_id']),
        processed_total,
        succeeded_total,
        processed_total - succeeded_total,
    )
142 |
143 |
# Not currently used - data retrieved from Project Sidewalk API
def extract_panowidthheight(path_to_metadata_xml):
    """
    Reads the pano dimensions from a pano metadata XML file.
    :param path_to_metadata_xml: path of the XML file; its root element must have a 'data_properties' child
        carrying 'width' and 'height' attributes
    :return: tuple (width, height) as ints
    :raises KeyError: if the root has no 'data_properties' child or an attribute is missing
    """
    pano = {}
    # Parse inside a context manager so the file handle is closed even if parsing fails.
    with open(path_to_metadata_xml, 'rb') as pano_xml:
        root = ET.parse(pano_xml).getroot()

    for child in root:
        if child.tag == 'data_properties':
            pano[child.tag] = child.attrib

    return int(pano['data_properties']['width']), int(pano['data_properties']['height'])
155 |
# Fallback function to get unique pano_ids in case we want to determine panoramas for scraping from a CSV.
def fetch_pano_ids_csv(metadata_csv_path):
    """
    Loads the provided metadata csv file (downloaded from the server) and keeps the first row seen for each
    gsv_panorama_id. The csv stands in for the information that would otherwise have to be gathered from Google
    maps, such as image size, image capture, coordinates.
    :param metadata_csv_path: The path to the metadata csv file and the file's name eg. sample/metadata-seattle.csv
    :return: A list of dicts, one per unique gsv_panorama_id, keyed by the csv's column names
    """
    all_rows = pd.read_csv(metadata_csv_path)
    one_row_per_pano = all_rows.drop_duplicates(subset=['gsv_panorama_id'])
    return one_row_per_pano.to_dict('records')
169 |
170 |
def fetch_pano_ids_from_webserver(include_all_panos):
    """
    Fetch panorama records from the web server's /adminapi/panos endpoint.

    Args:
        include_all_panos (bool): If True, output all panos, whether or not they have a label.
                                  If False, output only panos with labels on them.

    Returns:
        list[dict]: One record per unique pano id, exactly as received from the server. Records with an
        empty pano id or the id 'tutorial' are left out.
    """
    unique_ids = set()
    pano_info = []
    conn = http.client.HTTPSConnection(sidewalk_server_fqdn)
    try:
        conn.request("GET", "/adminapi/panos")
        r1 = conn.getresponse()
        data = r1.read()
    finally:
        # Release the socket whether or not the request succeeded.
        conn.close()
    jsondata = json.loads(data)

    # Structure of JSON data
    # [
    #     {
    #         "gsv_panorama_id": String,
    #         "width": Int,
    #         "height": Int,
    #         "lat": Float,
    #         "lng": Float,
    #         "camera_heading": Float,
    #         "camera_pitch": Float
    #     },
    #     ...
    # ]
    # Each record is also read for a "has_labels" entry below.
    for value in jsondata:
        pano_id = value["gsv_panorama_id"]
        has_labels = value["has_labels"]
        if (include_all_panos or has_labels) and pano_id not in unique_ids:
            # Check if the pano_id is an empty string.
            if pano_id and pano_id != 'tutorial':
                unique_ids.add(pano_id)
                pano_info.append(value)
            else:
                print("Pano ID is an empty string or is for tutorial")
    assert len(unique_ids) == len(pano_info)
    return pano_info
215 |
216 |
def download_panorama_images(storage_path, pano_infos):
    """
    Downloads the panorama image for every entry of pano_infos that is not yet listed in the csv download log,
    and records each outcome in that log as it goes.
    :param storage_path: directory that holds the panos and the gsv_panorama_id_log.csv log
    :param pano_infos: list of dicts; 'gsv_panorama_id', 'width' and 'height' are read from each
    :return: tuple (success_count, fallback_success_count, fail_count, skipped_count, total_completed)
    """
    logging.basicConfig(filename='scrape.log', level=logging.DEBUG)
    success_count, skipped_count, fallback_success_count, fail_count, total_completed = 0, 0, 0, 0, 0
    total_panos = len(pano_infos)

    # csv log file for pano_id failures, place in 'storage' folder (alongside pano results)
    csv_pano_log_path = os.path.join(storage_path, "gsv_panorama_id_log.csv")
    columns = ['gsv_pano_id', 'downloaded']
    if not exists(csv_pano_log_path):
        # First run: create the log with just its header row.
        df_pano_id_log = pd.DataFrame(columns=columns)
        df_pano_id_log.to_csv(csv_pano_log_path, mode='w', header=True, index=False)
    else:
        df_pano_id_log = pd.read_csv(csv_pano_log_path)
    processed_ids = list(df_pano_id_log['gsv_pano_id'])

    # Seed the counters from the existing log.
    # NOTE(review): progress_check returns (ids, processed, successes, failures), so the log's success total is
    # stored in skipped_count here - presumably because panos already in the log are skipped below; confirm.
    df_id_set, total_completed, skipped_count, fail_count = progress_check(csv_pano_log_path)

    for pano_info in pano_infos:
        pano_id = pano_info['gsv_panorama_id']
        # Anything already in the log (success or failure) is not attempted again.
        if pano_id in df_id_set:
            continue
        start_time = time.time()
        print("IMAGEDOWNLOAD: Processing pano %s " % (pano_id))
        try:
            pano_dims = (pano_info['width'], pano_info['height'])
            result_code = download_single_pano(storage_path, pano_id, pano_dims)
            if result_code == DownloadResult.success:
                success_count += 1
            elif result_code == DownloadResult.fallback_success:
                fallback_success_count += 1
            elif result_code == DownloadResult.skipped:
                skipped_count += 1
            elif result_code == DownloadResult.failure:
                fail_count += 1
            # Value for the log's 'downloaded' column: 0 only for a failure.
            downloaded = 0 if result_code == DownloadResult.failure else 1

        except Exception as e:
            # Any error during the download counts as a failure for this pano; the loop carries on.
            fail_count += 1
            downloaded = 0
            logging.error("IMAGEDOWNLOAD: Failed to download pano %s due to error %s", pano_id, str(e))
        total_completed = success_count + fallback_success_count + fail_count + skipped_count

        if pano_id not in processed_ids:
            # New id: append a single row to the log.
            df_data_append = pd.DataFrame([[pano_id, downloaded]], columns=columns)
            df_data_append.to_csv(csv_pano_log_path, mode='a', header=False, index=False)
        else:
            # Id already logged (e.g. repeated within pano_infos): rewrite the log with the updated status.
            df_pano_id_log = pd.read_csv(csv_pano_log_path)
            df_pano_id_log.loc[df_pano_id_log['gsv_pano_id'] == pano_id, 'downloaded'] = downloaded
            df_pano_id_log.to_csv(csv_pano_log_path, mode='w', header=True, index=False)
        processed_ids.append(pano_id)

        print("IMAGEDOWNLOAD: Completed %d of %d (%d success, %d fallback success, %d failed, %d skipped)"
              % (total_completed, total_panos, success_count, fallback_success_count, fail_count, skipped_count))
        print("--- %s seconds ---" % (time.time() - start_time))

    logging.debug(
        "IMAGEDOWNLOAD: Final result: Completed %d of %d (%d success, %d fallback success, %d failed, %d skipped)",
        total_completed,
        total_panos,
        success_count,
        fallback_success_count,
        fail_count,
        skipped_count)

    return success_count, fallback_success_count, fail_count, skipped_count, total_completed
282 |
283 |
def download_single_pano(storage_path, pano_id, pano_dims):
    """
    Downloads all tiles of one GSV panorama, stitches them into a single image and saves it as
    <storage_path>/<first two chars of pano_id>/<pano_id>.jpg.
    :param storage_path: root directory for downloaded panos
    :param pano_id: id of the panorama to download
    :param pano_dims: (width, height) of the final image; either entry may be None, in which case the value is
        taken from the pano's XML metadata file if one exists next to the image
    :return: DownloadResult.skipped if the image file already exists, DownloadResult.failure if the dimensions
        or a usable zoom level cannot be determined, DownloadResult.success otherwise
    """
    base_url = 'https://maps.google.com/cbk?output=tile&cb_client=maps_sv&fover=2&onerr=3&renderer=spherical&v=4'

    destination_dir = os.path.join(storage_path, pano_id[:2])
    if not os.path.isdir(destination_dir):
        os.makedirs(destination_dir)
        os.chmod(destination_dir, 0o775 | stat.S_ISGID)

    filename = pano_id + ".jpg"
    out_image_name = os.path.join(destination_dir, filename)

    # Skip download if image already exists
    if os.path.isfile(out_image_name):
        return DownloadResult.skipped

    final_image_width = int(pano_dims[0]) if pano_dims[0] is not None else None
    final_image_height = int(pano_dims[1]) if pano_dims[1] is not None else None
    zoom = None

    session = request_session()

    # Check XML metadata for image width/height max zoom if its downloaded.
    xml_metadata_path = os.path.join(destination_dir, pano_id + ".xml")
    if os.path.isfile(xml_metadata_path):
        print(xml_metadata_path)
        with open(xml_metadata_path, 'rb') as pano_xml:
            try:
                tree = ET.parse(pano_xml)
                root = tree.getroot()

                # Get the number of zoom levels.
                for child in root:
                    if child.tag == 'data_properties':
                        zoom = int(child.attrib['num_zoom_levels'])
                        # The XML only fills in dimensions that were not passed in.
                        if final_image_width is None: final_image_width = int(child.attrib['width'])
                        if final_image_height is None: final_image_height = int(child.attrib['height'])

                # If there is no zoom in the XML, then we skip this and try some zoom levels below.
                if zoom is not None:
                    # Check if the image exists (occasionally we will have XML but no JPG).
                    test_url = f'{base_url}&zoom={zoom}&x=0&y=0&panoid={pano_id}'
                    test_request = get_response(test_url, session, stream=True)
                    test_tile = Image.open(test_request)
                    # An all-black first tile is treated as "no imagery".
                    if test_tile.convert("L").getextrema() == (0, 0):
                        return DownloadResult.failure
            except Exception as e:
                # NOTE(review): every error while reading the XML or probing the test tile is silently ignored;
                # whatever values were set before the error are kept.
                pass

    # If we did not find image width/height from API or XML, then set download to failure.
    if final_image_width is None or final_image_height is None:
        return DownloadResult.failure

    # If we did not find a zoom level in the XML above, then try a couple zoom level options here.
    if zoom is None:
        url_zoom_3 = f'{base_url}&zoom=3&x=0&y=0&panoid={pano_id}'
        url_zoom_5 = f'{base_url}&zoom=5&x=0&y=0&panoid={pano_id}'

        req_zoom_3 = get_response(url_zoom_3, session, stream=True)
        im_zoom_3 = Image.open(req_zoom_3)
        req_zoom_5 = get_response(url_zoom_5, session, stream=True)
        im_zoom_5 = Image.open(req_zoom_5)

        # In some cases (e.g., old GSV images), we don't have zoom level 5, so Google returns a
        # transparent image. This means we need to set the zoom level to 3. Google also returns a
        # transparent image if there is no imagery. So check at both zoom levels. How to check:
        # http://stackoverflow.com/questions/14041562/python-pil-detect-if-an-image-is-completely-black-or-white
        if im_zoom_5.convert("L").getextrema() != (0, 0):
            zoom = 5
        elif im_zoom_3.convert("L").getextrema() != (0, 0):
            zoom = 3
        else:
            # can't determine zoom
            return DownloadResult.failure

    final_im_dimension = (final_image_width, final_image_height)

    def generate_gsv_urls(zoom):
        """
        Generates all valid urls of GSV tiles to be downloaded for stitching into single panorama. The tile grid
        is ceil(width / 512) columns by ceil(height / 512) rows of the final image dimensions.
        :param zoom: the valid/working zoom value for this pano_id
        :return: a list of ("x y", url) tuples, one per tile to be downloaded
        """
        sites_gsv = []
        for y in range(int(math.ceil(final_image_height / 512.0))):
            for x in range(int(math.ceil(final_image_width / 512.0))):
                url = f'{base_url}&zoom={zoom}&x={str(x)}&y={str(y)}&panoid={pano_id}'
                sites_gsv.append((str(x) + " " + str(y), url))
        return sites_gsv

    @backoff.on_exception(backoff.expo, (aiohttp.web.HTTPServerError, aiohttp.ClientError, aiohttp.ClientResponseError,
                                         aiohttp.ServerConnectionError, aiohttp.ServerDisconnectedError,
                                         aiohttp.ClientHttpProxyError), max_tries=10)
    async def download_single_gsv(session, url):
        """
        Downloads a single 512x512 panorama tile
        :param session: the aiohttp client session created in download_all_gsv_images
        :param url: ("x y", url) tuple as produced by generate_gsv_urls
        :return: a list containing - x and y position of the download image, downloaded image
        """
        # TODO: possibly not needed
        # # If not using proxies, delay for a little bit to avoid hammering the server
        # if proxies["http"] is None:
        #     time.sleep(new_random_delay() / 1000)
        async with session.get(url[1], proxy=proxies["http"], headers=random_header()) as response:
            head_content = response.headers['Content-Type']
            # ensures content type is an image
            if head_content[0:10] != "image/jpeg":
                # Raising one of the exceptions listed in the decorator makes backoff retry the tile.
                raise aiohttp.ClientResponseError(response.request_info, response.history)
            image = await response.content.read()
            return [url[0], image]

    @backoff.on_exception(backoff.expo,
                          (aiohttp.web.HTTPServerError, aiohttp.ClientError, aiohttp.ClientResponseError, aiohttp.ServerConnectionError,
                           aiohttp.ServerDisconnectedError, aiohttp.ClientHttpProxyError), max_tries=10)
    async def download_all_gsv_images(sites):
        """
        For the given list of sites/urls that make up a single GSV panorama, starts the connections, breaks each of the
        sites into tasks, then runs these tasks through asyncio.
        :param sites: list of all valid urls that make up the image
        :return: responses from the tasks which contains all the images and their position x and y data
        (needed for stitching)
        """
        # The connector caps the number of simultaneous connections at thread_count.
        conn = aiohttp.TCPConnector(limit=thread_count)
        async with aiohttp.ClientSession(raise_for_status=True, connector=conn) as session:
            tasks = []
            for url in sites:
                task = asyncio.ensure_future(download_single_gsv(session, url))
                tasks.append(task)
            # NOTE(review): with return_exceptions=True a tile that still fails after its retries is returned
            # as an exception object in this list rather than raised - verify the stitching loop below copes.
            responses = await asyncio.gather(*tasks, return_exceptions=True)
            return responses

    # NOTE(review): a 4-component color is passed for an 'RGB' image - confirm Pillow accepts this.
    blank_image = Image.new('RGB', final_im_dimension, (0, 0, 0, 0))
    sites = generate_gsv_urls(zoom)
    all_pano_images = asyncio.get_event_loop().run_until_complete(download_all_gsv_images(sites))

    # Paste every tile at its grid position; the "x y" label is split back into integers.
    for cell_image in all_pano_images:
        img = Image.open(BytesIO(cell_image[1]))
        img = img.resize((512, 512))
        x, y = int(str.split(cell_image[0])[0]), int(str.split(cell_image[0])[1])
        blank_image.paste(img, (512 * x, 512 * y))

    # TODO: sleep after entire pano downloaded versus each tile?

    if zoom == 3:
        # NOTE(review): Image.ANTIALIAS is reportedly unavailable in recent Pillow releases - confirm against
        # the installed version.
        blank_image = blank_image.resize(final_im_dimension, Image.ANTIALIAS)
    blank_image.save(out_image_name, 'jpeg')
    os.chmod(out_image_name, 0o664)
    return DownloadResult.success
432 |
433 |
def download_panorama_metadata_xmls(storage_path, pano_infos):
    '''
    Downloads, for every pano in pano_infos, the xml file that contains depth information from GSV.
    The per-pano work (creating the folder, skipping files that already exist) is done by
    download_single_metadata_xml; this function tallies the outcomes. An exception raised for one pano is
    logged and counted as a failure, and the loop moves on.
    Returns a tuple (success_count, fail_count, skipped_count, total_completed).
    '''
    total_panos = len(pano_infos)
    n_success = n_failed = n_skipped = 0
    total_completed = 0

    for info in pano_infos:
        pano_id = info['gsv_panorama_id']
        print("METADOWNLOAD: Processing pano %s " % (pano_id))
        try:
            outcome = download_single_metadata_xml(storage_path, pano_id)
        except Exception as e:
            n_failed += 1
            logging.error("METADOWNLOAD: Failed to download metadata for pano %s due to error %s", pano_id, str(e))
        else:
            if outcome == DownloadResult.failure:
                n_failed += 1
            elif outcome == DownloadResult.success:
                n_success += 1
            elif outcome == DownloadResult.skipped:
                n_skipped += 1
        total_completed = n_failed + n_success + n_skipped
        print("METADOWNLOAD: Completed %d of %d (%d success, %d failed, %d skipped)" %
              (total_completed, total_panos, n_success, n_failed, n_skipped))

    logging.debug("METADOWNLOAD: Final result: Completed %d of %d (%d success, %d failed, %d skipped)",
                  total_completed, total_panos, n_success, n_failed, n_skipped)
    return (n_success, n_failed, n_skipped, total_completed)
467 |
468 |
def download_single_metadata_xml(storage_path, pano_id):
    """
    Downloads the metadata XML of one panorama to <storage_path>/<first two chars of pano_id>/<pano_id>.xml.
    :param storage_path: root directory for downloaded panos
    :param pano_id: id of the panorama whose metadata is wanted
    :return: DownloadResult.skipped if the file already exists, DownloadResult.failure if the response is
        empty or is a "404 Not Found" page, DownloadResult.success once the file has been written
    """
    base_url = "https://maps.google.com/cbk?output=xml&cb_client=maps_sv&hl=en&dm=1&pm=1&ph=1&renderer=cubic,spherical&v=4&panoid="

    # Check if the directory exists. Then check if the file already exists and skip if it does.
    destination_folder = os.path.join(storage_path, pano_id[:2])
    if not os.path.isdir(destination_folder):
        os.makedirs(destination_folder)
        os.chmod(destination_folder, 0o775 | stat.S_ISGID)

    filename = pano_id + ".xml"
    destination_file = os.path.join(destination_folder, filename)
    if os.path.isfile(destination_file):
        return DownloadResult.skipped

    url = base_url + pano_id

    session = request_session()
    req = get_response(url, session)

    # Check if the XML file is empty. If not, write it out to a file and set the permissions.
    # Lines are looked up defensively: an empty body or a document shorter than five lines must not raise
    # IndexError (a short but valid XML document would otherwise be reported as an error by the caller).
    content_lines = req.content.splitlines()
    lineOne = content_lines[0] if content_lines else b''
    lineFive = content_lines[4] if len(content_lines) > 4 else b''

    # The error page is recognised by a substring of its fifth line so that the check does not depend on the
    # exact whitespace or markup surrounding the message.
    if lineOne == b'' or b'Error 404 (Not Found)!!1' in lineFive:
        return DownloadResult.failure
    else:
        with open(destination_file, 'wb') as f:
            f.write(req.content)
        os.chmod(destination_file, 0o664)

        return DownloadResult.success
500 |
501 |
def generate_depthmapfiles(path_to_scrapes):
    """
    Runs the ./decode_depthmap program on every .xml file found (recursively) under path_to_scrapes, writing
    <pano_id>.depth.txt next to each one. XML files that already have a .depth.txt file are skipped.
    :param path_to_scrapes: root directory to search for pano XML files
    :return: tuple (success_count, fail_count, skip_count, total_completed)
    """
    # Imported locally: the module's import block does not provide `call`/`subprocess`, so the bare `call`
    # used previously raised NameError on the first pano that needed decoding.
    import subprocess

    success_count = 0
    fail_count = 0
    skip_count = 0
    total_completed = 0
    # Iterate through all .xml files in specified path, recursively
    for root, dirnames, filenames in os.walk(path_to_scrapes):
        for filename in fnmatch.filter(filenames, '*.xml'):
            xml_location = os.path.join(root, filename)

            # Pano id is XML filename minus the extension
            pano_id = filename[:-4]
            print("GENERATEDEPTH: Processing pano %s " % (pano_id))

            # Generate a .depth.txt file for the .xml file
            output_file = os.path.join(root, pano_id + ".depth.txt")
            if os.path.isfile(output_file):
                skip_count += 1
            else:
                try:
                    output_code = subprocess.call(["./decode_depthmap", xml_location, output_file])
                except OSError as e:
                    # The decoder could not be started (e.g. missing or not executable); count this pano as a
                    # failure instead of aborting the whole run.
                    output_code = "OSError: %s" % e
                if output_code == 0:
                    os.chmod(output_file, 0o664)
                    success_count += 1
                else:
                    fail_count += 1
                    logging.error("GENERATEDEPTH: Could not create depth.txt for pano %s, error code was %s", pano_id,
                                  str(output_code))
            total_completed = fail_count + success_count + skip_count
            print("GENERATEDEPTH: Completed %d (%d success, %d failed, %d skipped)" %
                  (total_completed, success_count, fail_count, skip_count))

    logging.debug("GENERATEDEPTH: Final result: Completed %d (%d success, %d failed, %d skipped)",
                  total_completed, success_count, fail_count, skip_count)
    return success_count, fail_count, skip_count, total_completed
536 |
537 |
def run_scraper_and_log_results(pano_infos, attempt_depth):
    """Run every scrape stage and append one row of results to log.csv.

    The row is written to ``log.csv`` inside ``storage_location`` piece by
    piece: the start timestamp first, then the counts and duration of the XML
    download, the image download and the depth-file generation, and finally
    the total duration. The file is reopened in append mode and closed for
    each piece, so it is written out as soon as its stage ends. All durations
    are whole minutes.

    :param pano_infos: collection of panos to download; handed unchanged to
        the download functions, and its length is used for the XML placeholder.
    :param attempt_depth: when true, download the metadata XML and generate
        depth files; when false, both stages are replaced by placeholder counts.
    """
    def append_to_log(fragment):
        # Open/close per fragment so each piece of the row lands on disk at once.
        with open(os.path.join(storage_location, "log.csv"), 'a') as log:
            log.write(fragment)

    def whole_minutes(begin, end):
        return int(round((end - begin).total_seconds() / 60.0))

    start_time = datetime.now()
    append_to_log("\n%s" % (str(start_time)))

    # Stage 1: metadata XML (which contains the depth data), only if requested.
    if attempt_depth:
        xml_res = download_panorama_metadata_xmls(storage_location, pano_infos)
    else:
        pano_count = len(pano_infos)
        xml_res = (0, 0, pano_count, pano_count)
    xml_end_time = datetime.now()
    append_to_log(",%d,%d,%d,%d,%d" % (xml_res[0], xml_res[1], xml_res[2], xml_res[3],
                                       whole_minutes(start_time, xml_end_time)))

    # Stage 2: the panorama images themselves.
    im_res = download_panorama_images(storage_location, pano_infos)
    im_end_time = datetime.now()
    append_to_log(",%d,%d,%d,%d,%d,%d" % (im_res[0], im_res[1], im_res[2], im_res[3], im_res[4],
                                          whole_minutes(xml_end_time, im_end_time)))

    # Stage 3: extract depth data from the downloaded XML, only if requested.
    depth_res = generate_depthmapfiles(storage_location) if attempt_depth else (0, 0, 0, 0)
    depth_end_time = datetime.now()
    append_to_log(",%d,%d,%d,%d,%d" % (depth_res[0], depth_res[1], depth_res[2], depth_res[3],
                                       whole_minutes(im_end_time, depth_end_time)))

    # Close the row with the overall duration.
    append_to_log(",%d" % (whole_minutes(start_time, depth_end_time)))
574 |
575 |
# Script entry: collect the pano ids, then download everything for them.
# The ids come from the metadata CSV when one was supplied, otherwise from the
# Project Sidewalk API.
print("Fetching pano-ids")

pano_infos = (
    fetch_pano_ids_from_webserver(all_panos)
    if pano_metadata_csv is None
    else fetch_pano_ids_csv(pano_metadata_csv)
)

# To try the pipeline on a smaller subset, sample the list first, e.g.:
# pano_infos = random.sample(pano_infos, 10)
print(len(pano_infos))
# print(pano_infos)

# Use the pano_id list and associated info to gather panos from the GSV API
print("Fetching Panoramas")
run_scraper_and_log_results(pano_infos, attempt_depth)
593 |
--------------------------------------------------------------------------------