├── samples ├── sample_label.txt ├── sample_crop.jpg ├── sample_pano.jpg ├── getFullLabelList.sql ├── metadata-seattle.csv ├── labeldata.csv └── sample_meta.xml ├── decode_depthmap ├── requirements.txt ├── .gitignore ├── Dockerfile ├── flag_panos ├── README.md ├── json_to_csv.py ├── index.html └── index.js ├── DownloadRunnerDockerEntrypoint.sh ├── config.py ├── README.md ├── CropRunner.py └── DownloadRunner.py /samples/sample_label.txt: -------------------------------------------------------------------------------- 1 | pano_x 5110 2 | pano_y -688 3 | 4 | 7442 5 | -708 6 | -------------------------------------------------------------------------------- /decode_depthmap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectSidewalk/sidewalk-panorama-tools/HEAD/decode_depthmap -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas>=1.2.3 2 | Pillow>=8.1.2 3 | requests==2.25.1 4 | aiohttp>=3.7.4 5 | backoff>=1.10.0 -------------------------------------------------------------------------------- /samples/sample_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectSidewalk/sidewalk-panorama-tools/HEAD/samples/sample_crop.jpg -------------------------------------------------------------------------------- /samples/sample_pano.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectSidewalk/sidewalk-panorama-tools/HEAD/samples/sample_pano.jpg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.pyc 3 | *.swp 4 | id_rsa 5 | venv 6 | *.log 7 | crop.log.bak.txt 8 | .idea 9 | cropsbak 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | ARG DEBIAN_FRONTEND=noninteractive 3 | COPY . /app 4 | WORKDIR /app 5 | RUN apt-get update 6 | RUN apt-get install -y sshfs python3 python3-pip 7 | RUN pip3 install -r requirements.txt 8 | ENTRYPOINT ["./DownloadRunnerDockerEntrypoint.sh"] 9 | CMD [] 10 | -------------------------------------------------------------------------------- /samples/getFullLabelList.sql: -------------------------------------------------------------------------------- 1 | SELECT gsv_data.gsv_panorama_id, pano_x, pano_y, label_type_id, camera_heading, heading, pitch, label.label_id 2 | FROM label_point 3 | INNER JOIN label 4 | INNER JOIN gsv_data ON label.gsv_panorama_id = gsv_data.gsv_panorama_id 5 | ON label.label_id = label_point.label_id; 6 | -------------------------------------------------------------------------------- /flag_panos/README.md: -------------------------------------------------------------------------------- 1 | # Flag Panos 2 | 3 | This is a small web tool that Michael Duan (@michaelduan8) created to check the GSV API to see if it has metadata for panos that we don't currently have in our database. It was a one-time-use tool that was created to deal with the period right after Google shut down their depth data endpoint. Our code required access to it, so there was a lapse period where we did not have data. 
We are keeping this tool around in case it is ever useful again, but it won't be useful to others. 4 | -------------------------------------------------------------------------------- /flag_panos/json_to_csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | CITY = 'amsterdam' 4 | # JSON_TO_CONVERT = f'{CITY}_unretrievable_panos.json' 5 | # CSV_OUTPUT = f'{CITY}_unretrievable_panos.csv' 6 | 7 | JSON_TO_CSV_MAP = { 8 | f'{CITY}_pano_image_data.json': f'{CITY}_pano_image_data.csv', 9 | f'{CITY}_unretrievable_panos.json': f'{CITY}_unretrievable_panos.csv' 10 | } 11 | 12 | for json_file in JSON_TO_CSV_MAP.keys(): 13 | with open(json_file, encoding='utf-8') as f: 14 | df = pd.read_json(f) 15 | 16 | df.to_csv(JSON_TO_CSV_MAP[json_file], encoding='utf-8', index=False) 17 | -------------------------------------------------------------------------------- /samples/metadata-seattle.csv: -------------------------------------------------------------------------------- 1 | gsv_panorama_id,pano_x,pano_y,zoom,label_type_id,camera_heading,heading,pitch,label_id,width,height,tile_width,tile_height,image_date,imagery_type,pano_lat,pano_lng,label_lat,label_lng,computation_method,copyright 2 | _qVKgG3dGOoClMQI6QgVRg,10536,5049,3,1,143.214004516602,193.879470825195,-21.1875,120954,16384,8192,512,512,2019-06,1,47.6119232177734,-122.326042175293,47.6118603446453,-122.32606911454,approximation2, 3 | 7nNj0FtXp4bXcUGARDOcCg,6383,4553,1,1,181.617828369141,118,-22.0625,47125,16384,8192,512,512,2018-10,1,47.6568489074707,-122.313278198242,47.6567573547363,-122.313179016113,depth, 4 | -------------------------------------------------------------------------------- /samples/labeldata.csv: -------------------------------------------------------------------------------- 1 | gsv_panorama_id,pano_x,pano_y,label_type_id,camera_heading,heading,pitch,label_id 2 | 7nNj0FtXp4bXcUGARDOcCg,9785,4677,1,181.617828369141,202.9375,-17.5625,47126 3 | 7nNj0FtXp4bXcUGARDOcCg,10679,4747,1,181.617828369141,231.589279174805,-16.390625,74048 4 | 7nNj0FtXp4bXcUGARDOcCg,1912,4550,1,181.617828369141,51.2209815979004,-14.09375,74054 5 | _qVKgG3dGOoClMQI6QgVRg,9044,5043,1,143.214004516602,158.910720825195,-21.234375,120955 6 | _qVKgG3dGOoClMQI6QgVRg,15154,4963,1,143.214004516602,306.191955566406,-18.65625,120952 7 | _qVKgG3dGOoClMQI6QgVRg,15996,4827,3,143.214004516602,323.113830566406,-16.078125,120950 8 | -------------------------------------------------------------------------------- /DownloadRunnerDockerEntrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ./DownloadRunnerDockerEntrypoint sidewalk_server_fqdn 3 | # ./DownloadRunnerDockerEntrypoint sidewalk_server_fqdn user@host:/remote/path port 4 | 5 | mkdir -p /tmp/download_dest 6 | chmod 600 /app/id_rsa 7 | 8 | # Parse optional parameters at the end 9 | all_panos="" 10 | attempt_depth="" 11 | 12 | # Process arguments from the end 13 | while [[ $# -gt 0 ]]; do 14 | case "${@: -1}" in 15 | "--all-panos") 16 | all_panos="--all-panos" 17 | set -- "${@:1:$(($#-1))}" 18 | ;; 19 | "--attempt-depth") 20 | attempt_depth="--attempt-depth" 21 | set -- "${@:1:$(($#-1))}" 22 | ;; 23 | *) 24 | # Not an optional parameter, stop processing 25 | break 26 | ;; 27 | esac 28 | done 29 | 30 | # If one param, just download to /tmp. If three params, this means a host and port has been supplied. 
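# For reference, example invocations (the remote host, path, and port below are hypothetical); the optional
# flags go last and are stripped off by the loop above:
#   ./DownloadRunnerDockerEntrypoint.sh sidewalk-columbus.cs.washington.edu --all-panos
#   ./DownloadRunnerDockerEntrypoint.sh sidewalk-columbus.cs.washington.edu user@example.org:/remote/path 2222 --all-panos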
31 | if [ $# -eq 1 ]; then 32 | python3 DownloadRunner.py $1 /tmp/download_dest $all_panos 33 | elif [ $# -eq 3 ]; then 34 | echo "Mounting $2 port $3 for $1" 35 | sshfs -o IdentityFile=/app/id_rsa,StrictHostKeyChecking=no $2 /tmp/download_dest -p $3 && python3 DownloadRunner.py $1 /tmp/download_dest $all_panos; umount /tmp/download_dest 36 | else 37 | echo "Usage:" 38 | echo " ./DownloadRunnerDockerEntrypoint sidewalk_server_fqdn" 39 | echo " ./DownloadRunnerDockerEntrypoint sidewalk_server_fqdn user@host:/remote/path port" 40 | fi 41 | -------------------------------------------------------------------------------- /flag_panos/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Street View split-map-panes 5 | 6 | 7 | 8 | 9 | 32 | 33 |
34 | 48 | 49 | 50 | 51 | 55 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /flag_panos/index.js: -------------------------------------------------------------------------------- 1 | const CITY = 'amsterdam' 2 | const SIDEWALK_SERVER_FQDN = `https://sidewalk-${CITY}.cs.washington.edu` 3 | 4 | const OUTPUT_JSON = `${CITY}_pano_image_data.json` 5 | const UNRETRIEVABLE_PANOS_JSON = `${CITY}_unretrievable_panos.json` 6 | 7 | const CHUNK_SIZE = 10000; 8 | 9 | function getPanos(url, callback) { 10 | // grab panorama info from Project Sidewalk endpoint 11 | fetch(url) 12 | .then(response => response.json()) 13 | .then(result => callback(result)); 14 | } 15 | 16 | async function flag_panos_for_redownload(pano_data) { 17 | // initially, filter out panos that already have image data or have empty pano_id 18 | filtered_pano_data = pano_data.filter(pano => pano["gsv_panorama_id"] && (!pano["width"] || !pano["height"])); 19 | console.log(filtered_pano_data.length); 20 | 21 | // instantiate streetviewservice instance 22 | let streetViewService = new google.maps.StreetViewService(); 23 | 24 | let new_pano_data = []; 25 | let failed_to_retrieve_metadata = []; 26 | 27 | // Check pano metadata in chunks 28 | for (let i = 0; i < filtered_pano_data.length; i += CHUNK_SIZE) { 29 | let metadata_promises = []; 30 | 31 | pano_slice = filtered_pano_data.slice(i, i + CHUNK_SIZE); 32 | for (let pano of pano_slice) { 33 | // console.log(pano) 34 | let metadata_promise = streetViewService.getPanorama({pano: pano["gsv_panorama_id"]}, function(svPanoData, status) { 35 | if (status === google.maps.StreetViewStatus.OK) { 36 | tiles = svPanoData.tiles; 37 | new_pano_data.push({ 38 | gsv_panorama_id: pano["gsv_panorama_id"], 39 | image_width: tiles.worldSize.width, 40 | image_height: tiles.worldSize.height, 41 | tile_width: tiles.tileSize.width, 42 | tile_height: tiles.tileSize.height, 43 | copyright: svPanoData.copyright, 44 | center_heading: tiles.centerHeading, 45 | origin_heading: tiles.originHeading, 46 | origin_pitch: tiles.originPitch 47 | }); 48 | } else { 49 | // no street view data available for this panorama. 
50 | //console.error(`Error loading Street View imagery for ${pano["gsv_panorama_id"]}: ${status}`); 51 | failed_to_retrieve_metadata.push({gsv_panorama_id: pano["gsv_panorama_id"]}); 52 | } 53 | }); 54 | 55 | metadata_promises.push(metadata_promise); 56 | } 57 | 58 | // wait for all metadata promises to resolve 59 | // TODO: add a final flag in order to post everything when all batches iterated over 60 | results = await Promise.allSettled(metadata_promises) 61 | 62 | // .then(results => { 63 | // see how many failed in chunk 64 | console.log(results.filter(result => result.status == "rejected").length); 65 | 66 | // check updated new_pano_data length 67 | console.log(new_pano_data.length); 68 | 69 | // check if this chunk was the last chunk 70 | last_chunk = i + CHUNK_SIZE >= filtered_pano_data.length; 71 | 72 | if (last_chunk) { 73 | // turn pano_data list into JSON 74 | let json_pano_data = JSON.stringify(new_pano_data); 75 | 76 | // use Blob in order to create download URL for the JSON file 77 | let pano_data_blob = new Blob([json_pano_data], {type: "application/json"}); 78 | let pano_data_url = URL.createObjectURL(pano_data_blob); 79 | 80 | // visualize link on webpage 81 | let a_pano_data = document.createElement('a'); 82 | a_pano_data.href = pano_data_url; 83 | a_pano_data.download = OUTPUT_JSON; 84 | a_pano_data.textContent = `Download ${OUTPUT_JSON}`; 85 | 86 | document.getElementById('json-download').appendChild(a_pano_data); 87 | 88 | // turn unretrievable panos list into JSON 89 | let unretrievable_panos_json = JSON.stringify(failed_to_retrieve_metadata); 90 | 91 | // use Blob in order to create download URL for the JSON file 92 | let unretrievable_panos_blob = new Blob([unretrievable_panos_json], {type: "application/json"}); 93 | let unretrievable_panos_url = URL.createObjectURL(unretrievable_panos_blob); 94 | 95 | // visualize link on webpage 96 | let a_unretrievable = document.createElement('a'); 97 | a_unretrievable.href = unretrievable_panos_url; 98 | a_unretrievable.download = UNRETRIEVABLE_PANOS_JSON; 99 | a_unretrievable.textContent = `Download ${UNRETRIEVABLE_PANOS_JSON}`; 100 | 101 | document.getElementById('json-download').appendChild(a_unretrievable); 102 | } else { 103 | // sleep for a minute to not exceed QPM rate-limit on Google's end. 104 | console.log("Sleeping for 1 min to not exceed QPM limit") 105 | await new Promise(r => setTimeout(r, 60000)); 106 | console.log("Done Sleeping") 107 | } 108 | } 109 | } 110 | 111 | function initialize() { 112 | // Get pano_ids from Project Sidewalk api. 113 | // Afterwards, filter for panos with no image size data and query for said image metadata. 
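    // Each record returned by /adminapi/panos is assumed to look roughly like
    //   { gsv_panorama_id: String, width: Int|null, height: Int|null, lat: Float, lng: Float, camera_heading: Float, camera_pitch: Float }
    // (see the README's API definitions); flag_panos_for_redownload() only relies on gsv_panorama_id, width, and height.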
114 | getPanos(SIDEWALK_SERVER_FQDN + '/adminapi/panos', (data) => flag_panos_for_redownload(data)); 115 | } -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # Threads to use for asyncio - test but usually more threads the better as I/O task 2 | thread_count = 8 3 | 4 | # Proxy settings - if proxy not added, leave as is 5 | proxies = { 6 | "http": "http://", 7 | "https": "http://", 8 | } 9 | 10 | # ------------------------------- 11 | # Windows Headers 12 | # ------------------------------- 13 | 14 | # Edge 15 | headers_list = [ 16 | { 17 | 'Connection': 'keep-alive', 18 | 'Upgrade-Insecure-Requests': '1', 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81', 20 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 21 | 'Accept-Language': 'en-US,en;q=0.9', 22 | 'Referer': 'http://maps.google.com', 23 | }, 24 | 25 | # Firefox 85 on Windows 10 26 | { 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0', 28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 29 | 'Accept-Language': 'en-US,en;q=0.5', 30 | 'Referer': 'http://maps.google.com', 31 | 'DNT': '1', 32 | 'Connection': 'keep-alive', 33 | 'Upgrade-Insecure-Requests': '1', 34 | }, 35 | 36 | # Chrome 88 Windows 10 37 | { 38 | 'Connection': 'keep-alive', 39 | 'Upgrade-Insecure-Requests': '1', 40 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36', 41 | 'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8', 42 | 'Accept-Language': 'en-GB,en;q=0.9', 43 | 'Referer': 'http://maps.google.com', 44 | }, 45 | 46 | # Opera for Windows 10 47 | { 48 | 'Connection': 'keep-alive', 49 | 'Upgrade-Insecure-Requests': '1', 50 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 OPR/74.0.3911.160', 51 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 52 | 'Accept-Language': 'en-US,en;q=0.9', 53 | 'Referer': 'http://maps.google.com', 54 | }, 55 | 56 | # ------------------------------- 57 | # Mac Headers 58 | # ------------------------------- 59 | 60 | # Edge 88 Mac 61 | { 62 | 'Connection': 'keep-alive', 63 | 'Upgrade-Insecure-Requests': '1', 64 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81', 65 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 66 | 'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8', 67 | 'Referer': 'http://maps.google.com', 68 | }, 69 | 70 | # Opera 74 Mac 71 | { 72 | 'Connection': 'keep-alive', 73 | 'Upgrade-Insecure-Requests': '1', 74 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 OPR/74.0.3911.160', 75 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 76 | 'Accept-Language': 
'en-GB,en;q=0.9', 77 | 'Referer': 'http://maps.google.com', 78 | }, 79 | 80 | # Chrome 88 Mac 81 | { 82 | 'Connection': 'keep-alive', 83 | 'Upgrade-Insecure-Requests': '1', 84 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36', 85 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 86 | 'Referer': 'http://maps.google.com', 87 | 'Accept-Language': 'en-GB,en;q=0.9', 88 | }, 89 | 90 | # Firefox 85 Mac 91 | { 92 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0', 93 | 'Accept': 'image/webp,*/*', 94 | 'Accept-Language': 'en-US,en;q=0.5', 95 | 'Referer': 'http://maps.google.com', 96 | 'DNT': '1', 97 | 'Connection': 'keep-alive', 98 | }, 99 | 100 | # Safari 14 Mac 101 | { 102 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 103 | 'Upgrade-Insecure-Requests': '1', 104 | 'Host': 'maps.google.com', 105 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15', 106 | 'Accept-Language': 'en-ie', 107 | 'Connection': 'keep-alive', 108 | }, 109 | 110 | # ------------------------------- 111 | # Ubuntu Headers 112 | # ------------------------------- 113 | 114 | # Firefox 86 Ubuntu 115 | { 116 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0', 117 | 'Accept': 'image/webp,*/*', 118 | 'Accept-Language': 'en-GB,en;q=0.5', 119 | 'Referer': 'http://maps.google.com', 120 | 'DNT': '1', 121 | 'Connection': 'keep-alive', 122 | }, 123 | 124 | # Chrome 88 Ubuntu 125 | { 126 | 'Connection': 'keep-alive', 127 | 'Upgrade-Insecure-Requests': '1', 128 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36', 129 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 130 | 'Accept-Language': 'en-GB,en;q=0.9', 131 | 'Referer': 'http://maps.google.com', 132 | }, 133 | 134 | # Opera 74 Ubuntu 135 | { 136 | 'Connection': 'keep-alive', 137 | 'Upgrade-Insecure-Requests': '1', 138 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 OPR/74.0.3911.160', 139 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 140 | 'Accept-Language': 'en-GB,en;q=0.9', 141 | 'Referer': 'http://maps.google.com', 142 | } 143 | ] 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sidewalk-panorama-tools 2 | 3 | ## About 4 | This repository contains a set of Python scripts, intended to be used with data from [Project Sidewalk](https://github.com/ProjectSidewalk/SidewalkWebpage). The purpose of these scripts are to create crops of sidewalk accessibility issues/features usable for ML and computer vision applications from Google Streetview Panoramas via crowd-sourced label data from Project Sidewalk. 5 | 6 | The scripts are intended to be run inside a Docker container running Ubuntu 20.04 64-bit. 
However, one should be able to run these scripts on most Linux distros without the need for Docker, assuming the Python packages listed in `requirements.txt` can be installed. Additional effort would be required to use the downloader on a Mac or Windows machine without Docker. 7 | 8 | There are two main scripts of note: [DownloadRunner.py](DownloadRunner.py) and [CropRunner.py](CropRunner.py). Both should be fully functional, but only the downloader is actively in use (a new version is in the works), so we may not notice bugs with the cropper as quickly. More details on both below! 9 | 10 | **Note:** At least 2GB RAM is recommended, as these scripts may crash on very low memory systems due to the size of the images processed. 11 | 12 | ## Downloader 13 | 1. [Install Docker Desktop](https://www.docker.com/get-started). 14 | 1. Run `git clone https://github.com/ProjectSidewalk/sidewalk-panorama-tools.git` in the directory where you want to put the code. 15 | 1. Create the Docker image: 16 | ``` 17 | docker build --no-cache --pull -t projectsidewalk/scraper:v5 . 18 | ``` 19 | 1. You can then run the downloader using the following command: 20 | ``` 21 | docker run --cap-add SYS_ADMIN --device=/dev/fuse --security-opt apparmor:unconfined projectsidewalk/scraper:v5 <sidewalk_server_fqdn> 22 | ``` 23 | Where the `<sidewalk_server_fqdn>` looks like `sidewalk-columbus.cs.washington.edu` if you want data from Columbus. If you visit that URL, you will see a dropdown menu with a list of publicly deployed cities that you can pull data from. 24 | 1. Right now the data is stored in a temporary directory in the Docker container. You could set up a shared volume for it, but for now you can just copy the data over using `docker cp <container_id>:/tmp/download_dest/ <local_path>`, where `<local_path>` is the place on your local machine where you want to save the files. You can find the `<container_id>` using `docker ps -a`. 25 | 26 | Additional settings can be configured for `DownloadRunner.py` in the configuration file `config.py`. 27 | * `thread_count` - the number of threads you wish to run in parallel. As this uses asyncio and is an I/O task, the higher the count the faster the operation, but you will need to test what the upper limit is for your own device and network connection. 28 | * `proxies` - if you wish to use a proxy when downloading, update this dictionary with the relevant details, otherwise leave as is and no proxy will be used. 29 | * `headers` - this is a list of real headers that is used when making requests. You can add to this list, edit it, or leave as is. 30 | 31 | ## Cropper 32 | 33 | `CropRunner.py` creates crops of the accessibility features from the downloaded GSV panorama images via label data from Project Sidewalk, provided by their API. 34 | 35 | Usage: 36 | ```python 37 | python CropRunner.py [-h] (-d [D] | -f [F]) [-s S] [-o O] 38 | ``` 39 | * To fetch label metadata from the webserver or from a file, use one of the following (mutually exclusive, required): 40 | * ``-d <sidewalk_server_domain>`` 41 | * ``-f <metadata_file>`` 42 | * ``-s <pano_storage_directory>`` (optional). Specify if using a different directory containing panoramas. Panoramas are used to crop the labels. 43 | * ``-o <crop_output_directory>`` (optional). Specify if you want to set a different directory for crops to be stored. 44 | 45 | As an example: 46 | ```python 47 | python CropRunner.py -d sidewalk-columbus.cs.washington.edu -s /sidewalk/columbus/panos/ -o /sidewalk/columbus/crops/ 48 | ``` 49 | 50 | **Note** You will likely want to filter out labels where `disagree_count > agree_count`. These are based on human-provided validations from other Project Sidewalk users.
This filtering is not done in the code by default. There is also an option for a filter that is even more strict. This of course has the tradeoff of using less data, so this depends on the needs of your project: more data vs. more accurate data. To do this, you would query the `/v2/access/attributesWithLabels` API endpoint for the city you're looking at. Then you would only include labels where the `label_id` is also present in the attributesWithLabels API. This is a more aggressive filter that removes labels from some users who we suspect are providing low-quality data based on some heuristics. 51 | 52 | **Note** We have noticed some error in the y-position of labels on the panorama. We believe that this either comes from a bug in the GSV API, or it may be that there is some metadata that Google is not providing us. The errors are relatively small and in the y-direction. As of Apr 2023 we are working on an alternative cropper that attempts to correct for these errors, but it is still in development. The version here should work pretty well for now though! 53 | 54 | ## Definitions of variables found in APIs 55 | 56 | ### Downloader: /adminapi/panos 57 | | Attribute | Definition | 58 | | ------------- | ------------- | 59 | | gsv_panorama_id | A unique ID, provided by Google, for the panoramic image | 60 | | width | The width of the pano image in pixels | 61 | | height | The height of the pano image in pixels | 62 | | lat | The latitude of the camera when the image was taken | 63 | | lng | The longitude of the camera when the image was taken | 64 | | camera_heading | The heading (in degrees) of the center of the image with respect to true north | 65 | | camera_pitch | The pitch (in degrees) of the camera with respect to horizontal | 66 | 67 | 68 | ### Cropper: /adminapi/labels/cvMetadata 69 | You won't need most of this data in your work, but it's all here for reference. Everything through `notsure_count` might be useful, then there are a few that are duplicates from the API described above, then everything starting with `canvas_width` probably won't matter for you.
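As a point of reference, here is a minimal sketch (using the `requests` and `pandas` packages already listed in `requirements.txt`) of fetching this endpoint and applying the validation filter described in the note above; the fields it touches are defined in the table below.

```python
import pandas as pd
import requests

# Hypothetical example city; any publicly deployed Project Sidewalk server works the same way.
server = "sidewalk-columbus.cs.washington.edu"

# Fetch the label metadata used for computer vision work and load it into a dataframe.
labels = pd.DataFrame(requests.get(f"https://{server}/adminapi/labels/cvMetadata").json())

# Drop labels whose validations are net-negative (more "disagree" than "agree").
filtered = labels[labels["agree_count"] >= labels["disagree_count"]]
print(f"Kept {len(filtered)} of {len(labels)} labels")

# For the stricter filter, you would additionally keep only rows whose label_id also appears
# in the /v2/access/attributesWithLabels endpoint for the same city.
```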
70 | 71 | | Attribute | Definition | 72 | | ------------- | ------------- | 73 | | label_id | A unique ID for each label (within a given city), provided by Project Sidewalk | 74 | | gsv_panorama_id | A unique ID, provided by Google, for the panoramic image [same as /adminapi/panos] | 75 | | label_type_id | An integer ID denoting the type of label placed, defined in the chart below | 76 | | pano_x | The x-pixel location of the label on the pano, where top-left is (0,0) | 77 | | pano_y | The y-pixel location of the label on the pano, where top-left is (0,0) | 78 | | agree_count | The number of "agree" validations provided by Project Sidewalk users | 79 | | disagree_count | The number of "disagree" validations provided by Project Sidewalk users | 80 | | notsure_count | The number of "not sure" validations provided by Project Sidewalk users | 81 | | pano_width | The width of the pano image in pixels [same as /adminapi/panos] | 82 | | pano_height | The height of the pano image in pixels [same as /adminapi/panos] | 83 | | camera_heading | The heading (in degrees) of the center of the image with respect to true north [same as /adminapi/panos] | 84 | | camera_pitch | The pitch (in degrees) of the camera with respect to horizontal [same as /adminapi/panos] | 85 | | canvas_width | The width of the canvas where the user placed a label in Project Sidewalk | 86 | | canvas_height | The height of the canvas where the user placed a label in Project Sidewalk | 87 | | canvas_x | The x-pixel location where the user clicked on the canvas to place the label, where top-left is (0,0) | 88 | | canvas_y | The y-pixel location where the user clicked on the canvas to place the label, where top-left is (0,0) | 89 | | heading | The heading (in degrees) of the center of the canvas with respect to true north when the label was placed | 90 | | pitch | The pitch (in degrees) of the center of the canvas with respect to _the camera's pitch_ when the label was placed | 91 | | zoom | The zoom level in the GSV interface when the user placed the label | 92 | 93 | 94 | Note that the numbers in the `label_type_id` column correspond to these label types (yes, 8 was skipped! :shrug:): 95 | 96 | | label_type_id | label type | 97 | | ------------- | ------------- | 98 | | 1 | Curb Ramp | 99 | | 2 | Missing Curb Ramp | 100 | | 3 | Obstacle in a Path | 101 | | 4 | Surface Problem | 102 | | 5 | Other | 103 | | 6 | Can't see the sidewalk | 104 | | 7 | No Sidewalk | 105 | | 9 | Crosswalk | 106 | | 10 | Pedestrian Signal | 107 | 108 | ## Suggested Improvements 109 | 110 | * `CropRunner.py` - implement multi-core usage when creating crops. Currently it runs on a single core; most modern machines 111 | have more than one core, so this would give a speed-up when cropping tens of thousands of images and objects. 112 | * Add logic to the `progress_check()` function so that it can register when there is a network failure and avoid logging the pano id as visited and failed. 113 | * Project Sidewalk group to delete old or commented code once they decide it is no longer required (all code which used the previously available XML data). 114 | 115 | ## Depth Maps 116 | Depth maps are calculated using downloaded metadata from Google Street View. The endpoint being used to gather the needed XML metadata for depth map calculation isn't a publicly supported API endpoint from Google. It has been only sporadically available throughout 2022, and as of Apr 2023, has been unavailable for the past nine months.
We continue to include the code to download the XML and decode the depth data in our download scripts on the off chance that the endpoint comes back online at some point. 117 | 118 | **Note:** Decoding the depth maps on an OS other than Linux will likely require recompiling the `decode_depthmap` binary for your system using [this source](https://github.com/jianxiongxiao/ProfXkit/blob/master/GoogleMapsScraper/decode_depthmap.cpp). 119 | 120 | ## Old Code We've Removed 121 | In PR [#26](https://github.com/ProjectSidewalk/sidewalk-panorama-tools/pull/26), we removed some old code. Some was related to our Tohme paper from 2014, some had to do with using depth maps for cropping images. Given that no one seems to be using the Tohme code (those on our team don't even know how it works) and Google has removed access to their depth data API, we removed this code in Apr 2023. We are hoping that this will simplify the repository, making it easier to make use of our newer work, while making it easier to maintain the code that's actually being used. 122 | 123 | If any of this code ever needs to be revived, it exists in the git history, and can be found in the PR linked above! 124 | -------------------------------------------------------------------------------- /CropRunner.py: -------------------------------------------------------------------------------- 1 | """ 2 | ** Crop Extractor for Project Sidewalk ** 3 | 4 | Given label metadata from the Project Sidewalk database, this script will extract JPEG crops of the features that have 5 | been labeled. The required metadata should be obtained through an API endpoint on the Project Sidewalk server for a 6 | given city, passed as an argument to this script. Alternatively, if you have a CSV containing this data (from running 7 | the samples/getFullLabelList.sql script) you can pass in the name of that CSV file as an argument. 8 | 9 | Additionally, you should have downloaded original panorama images from Street View using DownloadRunner.py. You will 10 | need to supply the path to the folder containing these files. 11 | 12 | """ 13 | 14 | import sys 15 | import logging 16 | import os 17 | from PIL import Image, ImageDraw 18 | import json 19 | import requests 20 | from requests.adapters import HTTPAdapter 21 | import urllib3 22 | from urllib3.util.retry import Retry 23 | import pandas as pd 24 | import argparse 25 | try: 26 | from xml.etree import cElementTree as ET 27 | except ImportError as e: 28 | from xml.etree import ElementTree as ET 29 | 30 | # Mark the center of the crop? 31 | MARK_LABEL = True 32 | 33 | logging.basicConfig(filename='crop.log', level=logging.DEBUG) 34 | 35 | parser = argparse.ArgumentParser() 36 | group_parser = parser.add_mutually_exclusive_group(required=True) 37 | group_parser.add_argument('-d', nargs='?', help='sidewalk_server_domain (preferred over metadata_file) - FDQN of SidewalkWebpage server to fetch label list from, i.e. sidewalk-columbus.cs.washington.edu') 38 | group_parser.add_argument('-f', nargs='?', help='metadata_file - path to file containing label_ids and their properties. It may be CSV or JSON. i.e. samples/labeldata.csv') 39 | parser.add_argument('-s', default='/tmp/download_dest/', help='pano_storage_directory - path to directory containing panoramas downloaded using DownloadRunner.py. default=/tmp/download_dest/') 40 | parser.add_argument('-o', default='/crops/', help='crop_output_directory - path to location for saving the crops. 
default=/crops/') 41 | args = parser.parse_args() 42 | 43 | # FDQN SidewalkWebpage server 44 | sidewalk_server_fdqn = args.d 45 | # Path to json or CSV data from database 46 | label_metadata_file = args.f 47 | # Path to panoramas downloaded using DownloadRunner.py. 48 | gsv_pano_path = args.s 49 | # Path to location for saving the crops 50 | crop_destination_path = args.o 51 | 52 | def request_session(): 53 | """ 54 | Sets up a request session to handle server HTTP requests, retrying in case of errors. 55 | :return: session 56 | """ 57 | session = requests.Session() 58 | retries = Retry(total=5, connect=5, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=1, raise_on_status=False) 59 | adapter = HTTPAdapter(max_retries=retries) 60 | session.mount('https://', adapter) 61 | return session 62 | 63 | def fetch_label_ids_csv(metadata_csv_path): 64 | """ 65 | Reads metadata from a csv. Useful for old csv formats of cvMetadata such as cv-metadata-seattle.csv 66 | :param metadata_csv_path: The path to the metadata csv file and the file's name eg. sample/metadata-seattle.csv 67 | :return: A list of dicts containing the follow metadata: gsv_panorama_id, pano_x, pano_y, zoom, label_type_id, 68 | camera_heading, heading, pitch, label_id, width, height, tile_width, tile_height, image_date, imagery_type, 69 | pano_lat, pano_lng, label_lat, label_lng, computation_method, copyright 70 | """ 71 | df_meta = pd.read_csv(metadata_csv_path) 72 | df_meta = df_meta.drop_duplicates(subset=['label_id']).to_dict('records') 73 | return df_meta 74 | 75 | def json_to_list(jsondata): 76 | """ 77 | Transforms json like object to a list of dict to be read in bulk_extract_crops() to crop panos with label metadata 78 | :param jsondata: json object containing label ids and their associated properties 79 | :return: A list of dicts containing the following metadata: label_id, gsv_panorama_id, label_type_id, agree_count, 80 | disagree_count, notsure_count, pano_width, pano_height, pano_x, pano_y, canvas_width, canvas_height, canvas_x, 81 | canvas_y, zoom, heading, pitch, camera_heading, camera_pitch 82 | """ 83 | unique_label_ids = set() 84 | label_info = [] 85 | 86 | for value in jsondata: 87 | label_id = value["label_id"] 88 | if label_id not in unique_label_ids: 89 | unique_label_ids.add(label_id) 90 | label_info.append(value) 91 | else: 92 | print("Duplicate label ID") 93 | assert len(unique_label_ids) == len(label_info) 94 | return label_info 95 | 96 | def fetch_cvMetadata_from_file(metadata_json_path): 97 | """ 98 | Reads json file to exctact labels. 99 | :param metadata_file_path: the path of the json file containing all label ids and their associated data. 100 | :return: A list of dicts containing the following metadata: label_id, gsv_panorama_id, label_type_id, agree_count, 101 | disagree_count, notsure_count, pano_width, pano_height, pano_x, pano_y, canvas_width, canvas_height, canvas_x, 102 | canvas_y, zoom, heading, pitch, camera_heading, camera_pitch 103 | """ 104 | with open(metadata_json_path) as json_file: 105 | json_meta = json.load(json_file) 106 | return json_to_list(json_meta) 107 | 108 | # https://stackoverflow.com/questions/54356759/python-requests-how-to-determine-response-code-on-maxretries-exception 109 | def fetch_cvMetadata_from_server(server_fdqn): 110 | """ 111 | Function that uses HTTP request to server to fetch cvMetadata. Then parses the data to json and transforms it 112 | into list of dicts. Each element is associated with a single label. 
113 | :return: list of labels 114 | """ 115 | url = 'https://' + server_fdqn + '/adminapi/labels/cvMetadata' 116 | session = request_session() 117 | try: 118 | print("Getting metadata from web server") 119 | response = session.get(url) 120 | response.raise_for_status() 121 | except requests.exceptions.HTTPError as e: 122 | logging.error('HTTPError: {}'.format(e)) 123 | print("Cannot fetch metadata from webserver. Check log file.") 124 | sys.exit(1) 125 | except urllib3.exceptions.MaxRetryError as e: 126 | logging.error('Retries: '.format(e)) 127 | print("Cannot fetch metadata from webserver. Check log file.") 128 | sys.exit(1) 129 | 130 | jsondata = response.json() 131 | return json_to_list(jsondata) 132 | 133 | def predict_crop_size(pano_y, pano_height): 134 | """ 135 | As it stands, this algorithm: 136 | 1. Converts `pano_y` and `pano_height` to the old version of `pano_y` that we had when this alg was written. 137 | 2. Approximates the distance to label from camera using an experimentally determined formula. 138 | 3. Predict an ideal crop size using an experimentally determined formula based on the estimated distance. 139 | 140 | Here is some context for the current formulae: 141 | https://github.com/ProjectSidewalk/sidewalk-cv-tools/issues/2#issuecomment-510609873 142 | https://github.com/ProjectSidewalk/SidewalkWebpage/issues/633#issuecomment-307283178 143 | 144 | There are some clear areas to improve this function: 145 | 1. We have an updated distance estimation formula that takes into account zoom level: 146 | https://github.com/ProjectSidewalk/SidewalkWebpage/blob/develop/public/javascripts/SVLabel/src/SVLabel/label/Label.js#L17 147 | 2. That distance estimation formula should be recreated given some of the bugs we've fixed in the past few years. 
148 | """ 149 | old_pano_y = pano_height / 2 - pano_y 150 | crop_size = 0 151 | distance = max(0, 19.80546390 + 0.01523952 * old_pano_y) 152 | 153 | if distance > 0: 154 | crop_size = 8725.6 * (distance ** -1.192) 155 | if crop_size > 1500 or distance == 0: 156 | crop_size = 1500 157 | if crop_size < 50: 158 | crop_size = 50 159 | 160 | return crop_size 161 | 162 | 163 | def make_single_crop(path_to_image, pano_x, pano_y, output_filename, draw_mark=False): 164 | """ 165 | Makes a crop around the object of interest 166 | :param path_to_image: where the GSV pano is stored 167 | :param pano_x: x-pixel of label on the GSV image 168 | :param pano_y: y-pixel of label on the GSV image 169 | :param output_filename: name of file for saving 170 | :param draw_mark: if a dot should be drawn in the centre of the object/image 171 | :return: none 172 | """ 173 | pano = Image.open(path_to_image) 174 | draw = ImageDraw.Draw(pano) 175 | 176 | pano_width = pano.size[0] 177 | pano_height = pano.size[1] 178 | print(pano_width, pano_height) 179 | 180 | predicted_crop_size = predict_crop_size(pano_y, pano_height) 181 | crop_width = predicted_crop_size 182 | crop_height = predicted_crop_size 183 | 184 | r = 10 185 | if draw_mark: 186 | draw.ellipse((pano_x - r, pano_y - r, pano_x + r, pano_y + r), fill=128) 187 | 188 | print("Plotting at " + str(pano_x) + "," + str(pano_y)) 189 | 190 | top_left_x = pano_x - crop_width / 2 191 | top_left_y = pano_y - crop_height / 2 192 | cropped_square = pano.crop((top_left_x, top_left_y, top_left_x + crop_width, top_left_y + crop_height)) 193 | cropped_square.save(output_filename) 194 | 195 | return 196 | 197 | 198 | def bulk_extract_crops(labels_to_crop, path_to_gsv_scrapes, destination_dir, mark_label=False): 199 | total_labels = len(labels_to_crop) 200 | no_metadata_fail = 0 201 | no_pano_fail = 0 202 | success = 0 203 | 204 | for row in labels_to_crop: 205 | pano_id = row['gsv_panorama_id'] 206 | print(pano_id) 207 | pano_x = float(row['pano_x']) 208 | pano_y = float(row['pano_y']) 209 | label_type = int(row['label_type_id']) 210 | label_id = int(row['label_id']) 211 | 212 | pano_img_path = os.path.join(path_to_gsv_scrapes, pano_id[:2], pano_id + ".jpg") 213 | 214 | print(f'Cropping label {1 + no_pano_fail + no_metadata_fail + success} of {total_labels}') 215 | print(pano_img_path) 216 | # Extract the crop. 
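        # The pano is expected where DownloadRunner.py stores it, in a subfolder named after the first
        # two characters of the pano ID, e.g. <pano_storage_directory>/7n/7nNj0FtXp4bXcUGARDOcCg.jpg
        # (example ID taken from samples/labeldata.csv).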
217 | if os.path.exists(pano_img_path): 218 | destination_folder = os.path.join(destination_dir, str(label_type)) 219 | if not os.path.isdir(destination_folder): 220 | os.makedirs(destination_folder) 221 | 222 | crop_destination = os.path.join(destination_dir, str(label_type), str(label_id) + ".jpg") 223 | 224 | if not os.path.exists(crop_destination): 225 | make_single_crop(pano_img_path, pano_x, pano_y, crop_destination, draw_mark=mark_label) 226 | print("Successfully extracted crop to " + str(label_id) + ".jpg") 227 | logging.info(f'{str(label_id)}.jpg {pano_id} {str(pano_x)} {str(pano_y)} {str(label_id)}') 228 | logging.info("---------------------------------------------------") 229 | success += 1 230 | else: 231 | no_pano_fail += 1 232 | print("Panorama image not found.") 233 | logging.warning("Skipped label id " + str(label_id) + " due to missing image.") 234 | 235 | print("Finished.") 236 | print(f"{no_pano_fail} extractions failed because panorama image was not found.") 237 | print(f"{no_metadata_fail} extractions failed because metadata was not found.") 238 | print(f"{success} extractions were successful.") 239 | return 240 | 241 | 242 | print("Cropping labels") 243 | 244 | if label_metadata_file is not None: 245 | file_path = os.path.splitext(label_metadata_file) 246 | if file_path[-1] == ".csv": 247 | label_infos = fetch_label_ids_csv(label_metadata_file) 248 | elif file_path[-1] == ".json": 249 | label_infos = fetch_cvMetadata_from_file(label_metadata_file) 250 | else: 251 | label_infos = fetch_cvMetadata_from_server(sidewalk_server_fdqn) 252 | 253 | bulk_extract_crops(label_infos, gsv_pano_path, crop_destination_path, mark_label=MARK_LABEL) 254 | -------------------------------------------------------------------------------- /samples/sample_meta.xml: -------------------------------------------------------------------------------- 1 | © 2016 Google4500 Sheriff Rd NE4500Washington, District of ColumbiaUnited StatesSheriff Rd NESheriff Rd NESheriff Rd 
NEeJzt2glc1HX-x3HkUBQUVFAxzPIAxc37gGHm9_vNDzPFCCG8ylLzANTyKMgU-KFZ8s-Ov5rmYxWztlqtvMqT-Q4_j2VT10zXlU1xK5XVLG9pK9Jtf3MAM8Nvjt_Mb-Yz43yej0dhIjO_-b3evx84j0LnBQQEBjQJDUAIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCEfFR4OfQQIUFR4A-hjQR4XZdo_vJmec48UbEreg0TuEiXWX9oKgsW485iRbKJs9Le_AdHy2N-H2OtvdQM20mN_nxHlSH_LCdhLj_19RZSj_QU9dF_gUHrs7ysk9Rf-UWH-ewr292tDsL9fk9K_B_a_52B__4b9_Rv292_Y379hf__mY_379HlA0NodD-2fsL9_k9q_r8P53dm_9T2ygEg9yCNwY__WDWQ73HuyP-QIGve3mh_7yy7SXExMjINf2F6uI_BM_yflOtx7vr-DC2gvkOUIsD8kl_o3WoDCiSPA-z8kF_tbLEDhxAC8r3-vXr3GDBtm5ZPYP8LAfACGDwpFcrLUI_C2_r16ydi_tyA2INbA8WPwHPv9jbkjYmIizJj1z2zf_iGBD_fvqjdunBv610lKSopm2Q7moi1JOHUysNe_obff9U9Le1jQdYDu9ycOGzbT8OdM-_9Bb0DrAXrp6Y0f1Yn-nl3Ao57rP9Kg7_3mpPZ3_O1fp_o_YbX_xIkTA8T66x4e-zvWv6_Aq_s_Id4_rXud-v7Cr-v7d-8u3n-SU_1jY0eM6NkzIGD2swGJUntKJUP_QYMGMSb9GYYZNKjR88zxh_65lg_qSv-eiTo9dVJSYmNTUlzJbJXr_Z8R7_-ITv3D0EZW-zv09j_2l52b-j-S7J7-zdzWv56hf_1_AvQfcc_0z8nJmSLIyvKx_mPGmPzGeJ2hdWz1nzZt2miD1FTDv_23f5bB8HpP6Zj1n_u4jkf6S3v7v1c3E6mpkye3a1DfXxiFof9gnaGmBteZ5VT_aMv-Pb25_3Ni_bOyMjLG1nvcYILOY2amTp3qof4tW8bFmb6wsLAwpVL_q1ATJqG72ZRax2wcpgZbeFpHSn9hAAkCb--vv7gzdR8zMoT-GUb3OSJeILF_C-f6tzQICorTCQ0NM6PUCRXVOGwTS6IDaSI6CvH-_YwSRBm_1C35bfSP17PfXx8y0_jRPG1j06dPt9yA3f4tTPTv39-l_voJBNWvwHIHdrZgexcW-lkn3jmh0UM0PFtgYKDZQXYxCDW8pLqYbdq0CWxax3ShnfRCHOxPGYUIZtjtLx46PsTaNW_66EbZDf1H6bQQIZTv31xHQv9WRsLZC2xpKijIfAWiQ5AwBqcGYiW1SfJAy-wm5QWmL6iNQaB4f5MNhIQMHDjwwXpi_dX1OutR9cT6CxsRy9w5xIrs7M4WrCW3LK_TsaOE_k1amS5AfAPmK7C-BJf3YEegGbHnNilv3j6oob6N_mYb0K9AOJ2i_RsVUJuqq2bMbxmzTgtrn7Db20p5XXsdKf2bWC7A2gZEZmBvCuLjkBLZkrXHNQvfqL1ZfZMBiPW32ICwApH-utuCA3F08R2q6ASL8PXtpfdvPABbG7AyA6fWYLu1neRm4buYHIPIgZvVt3MDENtA_c3A6EEL1vq7KXzzRhra645VYn_xBdjeQN0MrOzAsU1IbC0W3uK5RA_Yor5j_UVHoJ_BQItzbzkGR-4McnU3S9-x4TCl5be6ALsbsHc_cHYZUq53m-0tb_3S-otvwMoObG5CbBvNXckuctmba6vjcH_rC3BsA5LuCNL7W69uK714fYn9bYzAgSHY5Fx0y_Qi7Y319YyvwG7_hgE0XoCUETh-V3Ctuf32Vuo709_2BpydgklvKV9mJ71pfUkv0MYtQGQEjmzA9iSkt5aQ3np9Z_s7NgKLPTh9b7CX3sbzOlnfoQW4PoIGEmtLSd-QX-wVuNBf-gqsk798iMiN3w0LcObbgXz9HXti6_UD7b8BIGEELqxA5vJy1JewANdX4KbyduvLcANoNAN39JcSXrb6EhfgygrcUt6B-vL2N3JiBna7S5uTTPH1JC7AuRlIC-_wrKz-2OfW_nUkfFcQry4teqP4ctTXcWYBEocg2_XeuL6dI3RjfxMu_4Agvb6ch-_CAsSnYJlStuYS63uqvz1yx5e1vo4cC7DB6cou1r-n-rstvp5bF-CO-I7Uv3f6u-_Sr2f7XWGv6S-lfqBMbwC4yvvjG7hrAWD1veQG4BPx9dyzAHnrS3pqH-_v0fh67vg2IGN8afV9ur9nr3wTsi8ArL7v9geLryfzTQAqvq_2B21vJOcC5Kjv5FP7XH9vaG8k2wRgLn3f6-9F7Q1k-j4AFd9bBuCT7Y3kmABYfN_o763p67k6AWfju9reW_pbH4DXp6_j0gTA4ntxf18Jb8LpDUC198b-vpfdlHM_DUhqL2d8b-l_b5G8AYjrHvu7laQbAVB6HIC7ObgCqPTY3yNa2ZsBTHjs73GtxKdgVt0j0bG_N_F0cuzvXbC_fwPt7x3_C6hf847-OAAo2N-_YX__hv39G_b3b7D98S8A0LykPw4ACPb3b9jfv2F__4b9_RtwfxwAMOzv37C_f8P-_g26P74DCAs6P94AYEHnx_6woPNjf1jQ-bE_LOj8OABY0PWxPyzo-tgfFnR97A8Lun4gDgAUdPxA7A8KOn4g9gcFHV8HBwAHur0O9ocD3V4H-8OBbq-HAwADnV4P-4OBTq-H_cFApzfAAUCBLm-A_aFAlzfA_lCgyxvhAIBAhzfC_kCgw9fBAcCA7l4H-8OA7l4PBwACOns97A8COnsDHAAE6OoNsD8E6OomcAAAoKObwgF4HnRzU9jf86Cbm8EBeBx0cnM4AE-DLm4O-3sadHELOAAPgw5uCQfgWdC9G8EBeBR07sZwAJ4EXVsEDsCDoGOLwQF4DnRrcbgAT4EubQUOwEOgQ1vTFBfgEdCdrcMBeAJ0ZVtwAe4H3dg2XIC7QRe2BxfgXtB9HYATcCPouA7Bvwy4DXRah-EE3AI6qxR4G5AfdFPJcAOygs7pHLwTyAW6pEtwBS6DTigPHIKzoMvJD783SAFdy7OamtO9fugAwKCLgIMOAAz69IODDgAM-vSDgw4ADPr0g4MOAAz69IODDgAM-vSDgw4ADPr0g4MOAAv67HsF6AhgoE-8FwiwMLL1t4rUeQHK6rsc37U4iflrn0WamSUVZTN_5_g10xOZoQOXkl8L_5TM_ofj39-tZPpFFPAtrvSnTrVbrrmzulj9SeZWzaox2cqlv3B8YdkA5od9NR-8UtJCu_gMx3d7TMEo9m0s37F2Lr8y-Lhy8czmLJf4X014-JHkQ3c4PvMHFdN2xhbyybj5ytLLHD9kyUDm3IoxJHgVrbz4E8efy0tkXrzVnyzbGq0adZzj912azow91lZboRWOdUmVMnb2D
XVX9avkky7nVD_lFvHlB4uZbqNe0UQujFOlHeb47I0PM7FN_qg8f5bjy4urlM_-GM2-e4Oj_5u1XVmyeULZrJDD6tykPH7I9RXUzlu3NWka4fNHczSbr9RoVT2L-FaLWzPN4o4oDk0t1cZQHF8wahzzwV_z-Y_iS6iyaec0-xf2VJ9PKqVuvfY8_dB7J0j3RWo270oiaXcoXfmmcL60p9MY5l8cHTEiRJVe-nbZnC7R6hvao8rakxwdu7yKFMYksBlVBXTelc7lxw4UJ4-uuKhuExxAbtQElw8-WcjnDn-DaRvEkiObP03majm-9fgMZvChOFXHzzj644Iqcu3AErak-qJmWV-NKuxKEb-2tJbOUOaWP9X7JT7x0Urlv09uZyefiCYbEkcrd_3M8Rc7DGaOHkrh2XPd6Ccrl5EFXVLYkuNzyZHD87WnOY7fQr3FhDVrQkb-p5LkXBLOf_9OzM0ijl_4XLFq84CysidH7GOpp-aQFxOU1HfrFvI38waoh62fSy3dMp8OvnCKVP_4MrunNIhknn1Hmz-C4z-__TXd8a0bmpbR8eSUcPzvJC1ipnBz-ILsrVTawjZkaO-r6h7TYlQLyzg6cnEV-T5pJXt5-x_I5eVNqZqDhTw_7xYT8EYxdWzni3R19CkyQ9uGZaYt4F9oOppKzNmiWV32s_rZftHljz9UyP_6-mnlqp_L2XObblAnvp1M5504QOYcf4Y9dSmaXj83nR7LfE6qLu5Xt79cxO_fRlRTvo7U1J5fqZ4YcZ8ibtpg5dXfOP4Oncoc2PQeWRC5X5X1VRE_ZU45vfuB2VTgqfn054GVZM97W1hFxjqy__8CqHVXC_kFO1-go15rSq3bVsjvnHZaOfbp79ia6J1kV-ebyYOvcfzGUgXD_DSdr_jiLPXKyiRS26kvc-kuR2-PCCJbzgzd027HSnbNslh-SkEqXfPcp6TmWhI7fOcCOiwyvfzYI5uSp4aeUP-ZKaR_WBFdflMzO_nAOqIu--CEZtxHfbQTD3D82PhKusvocH5p6Bi634JdZGR8Syazskrz2NZibUghxyfP3cXM-nVS-Wd_fom_tLVSeWnkXXZkq63U8wVz-OulXyof1lxl992-n-xWfkP-Vs3xU6qXMJOucvQ3vQ8qT9-dWcbteFr9xAWOpuJvkVeHhSnCdm1jI3-7QCY_sE2luV3EL7wvWJ30wQJeNX4U9fUzH2uWTWyasmXv29qXkzm64uszRHvuJPsh9a6qexeO77PvjDJ3R7uUhNw3ybhbfZLPCtd375m5zPnvOHr4pDuEhEcpviyfxP77ixXkTt4_kyNrhL1SCmbctdfLPwzJ50dvO6ncm90y5fmFYXTo6jF0depeonm7L6MZPZs6eWI-HRVQSYYe3cuOuLqI_DF8nXZwknD9ZbdVb38hl58Q_HfqtUf7kieGZLHdXkkli66PUI38hOMnZF6hT312l6xfc0d748dCPmxIvvrv3y9Tfv8Nx896uUqZsXIW2yfiIrlz-xftT78X8pm1mer1P-ruh3_RfCkc__TYtUzI8mLSQbtKtWg4x1PVD6rfCJvHl2_-iAr6pQU50qKE3dPxFqntkJIcfYHjh8_IYV7dsoDftSKdit-zSROZ_gXbMWADmbd4s-pUM47Pn_ELM3Avx3dQUdrz-RWK9IoL7G_TnyonZ17iD5-uVD4ad5ZOeuNN8v87vko-KJwfrlUaEzPoIxK_cY522CKOH70kQb2p_XFCffqxcoNwzytoWsAE7C7k738plFqZOF6z9vJhVrM8T7V6HkfvHlVFOtYqUlb_LY-UvXWevPktx8d8-hZ9rc1iKuHlF_miUyeVvdp2SQkb1p58eJYifxHu75W_72YuXE8gnZLOK-cIr6dD8Xf0sn1rFbcqvlKd2FxEf788VF3bJpBczWym6iHc__vl3KCblISTqzHjVKPe5_gWxyvo3KjXyc7gDdrmvTk-vGCNesL1EurYqnx6X_w_CPdNNXugRxzJKVuhGpQlPN-mrVTXfxTwJe89QM18epGmacJ6drviK3L7tXDq_aJCPqNnhvrkQxHUyamF9KS9p8k7p6vZrqUrSOyGg9ojh4v4roOWqu_7E0drasZr84-uUUy8eZRNe_x5em3Ld8u57GDlF-Eb1RmdjiqZmfnayy9w9NqsUvp_qLg2qQeJzt2Q1wm_Vhx3EnJokTJ3XiQAiUkSUQICVq9f5CiMnz6P1dj97lA3TW22O9v7-22fXtrnTsOm69a7u2FEZfVlJeRkaPKy9JJNN1MFp6XV-gsOtuR6_cxih0vW3ddu0kK3YUS7IlS_Ij4d_nLja2rEd_-_uTcg4TzrGxrWNbJsYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARtSWLUyfABi0dcslTJ8FNtx4Y_-eZnBFo_4eEgZlvFX_LVu2VXV6iStaGeSZoW_GV-m_9ga2b2_ZHv1Hxlr9225gR9V29B9x4530XzmBHUvQf8SNd9p_YmJicQM7LoP-I66r_tU_Ozruz_R3Bp1A_01tJ_pvat3034b-7znov7mh_-aG_psb-m9u6L-5jVj_XbsmJ3fv3j2IS29O6L-5ddt_x1D0f68sYE_N-5g8wQD7N9y3b8d9T_ZfxNAJ0J9JDf3f19UMtvbrBBvTf7Jfx33P9-9wAVur-nIC9GdST_2bFjC1jhPg9Z9JPfZfsYCpdQxg-Prv3bt39759bW5E_62XGa9_cufi26mp6f3dnqB9_22M9L_yyj72v6qm_rb6fgit3f_ARePjtdu2rlTvv3Pn1VOLRrb_wUXXHBxA_yXVO03unpy8tr3az7mLH10frNX_wIFN2__91_3R9ddff-iPDx06tG_fvoMH61_X2L-e9fCRGxbdeGPzVdfRf2MXsHWN_kf71_-mmxdN31J37KIh6_-Bev-Dzf3rE2juf6QK_Tvrv3__9HD3v7V1_0Nj9X67J5f7Vy33v-qq1v2Pr6v_7qtYrNq9P_ihMXZ3NbvXh_7T09P7G_pzuFwOp-lxbu69_8TQ9-etvGhP_fnsKoFAIBSKRGKxSNRD5fZ67y9p3f-2E1XLl-Fya5_lcNr27-jXP_TvuwH1v-3EYPo3_-_fAf3-t_xh3_uvkv9if1ZD_9tHuv_Jkydnqu64Y8T6X_bb3akGhw-v1r-KqDt-vP528_YnG5yQ3lSz8u__Wzau_7Vd_VD2Xmx90TXXNHwoW-pfLV7PuvjL35FGcrlCoVBWqdbV_8DK_sKR60-S-_fXnu11Uxdvq7llWS2--phmg_prdVpt4zem1Wr37l36rwaTS3avajnq0gqWHGlHX9NN_zE221A17P2Npqpqf4qqhZ5e0XxqytzolhU27Pmv60itweRqVh9F-5Usad1_6csbH-moZZFhkbVqIPlX6X_1orX7L0Zefv4vu3qlVvlruuk_se7-E810l33U7im56hy6HUgXFzp6iaWBoZWlmF
Zry5uXNmTosH_9kcbHd1aNrdm_KXTdznbGm2w51rp_0z8ALeqif4voLe1qsNrL8_om0Y2mvwoOHDi6Qr1O7f3RFZto3Ill5d0urchqs11aRav-9a9v7lRbQHP_nTvtnWVuVg_YMv2KFTTModf-DVPatZoOhtDlQrq6xIEmTUUbyra9rf2d6ntp0__AgUuB21jj5tZfvWxlxbVnsGR9v_81k8lkq_bv0xTWqTl_C5cF7egey_dcet-if_2Wpid6K10EXz1-FyvoS3_Zsk43sEFjWH6YrnL2rmk-za_0_bFK-86W0Gt_WQtdj6Bfe2h_2T0NNngLi_kbbXj71XfQaf8OwvdpA_23p6X-Nl6-YPPjvK-dWsaO9tD67utaQLuXgs77r5F-CDfQun_LHbQex-oXWFPbAfRsfRNov4Ka1v0v3dJp_6EZQY_1erd9--AW0MMG1hpCm0l01X8YNsB0fodj-0VDuoH2I2j5UtD1AJgdAbPp67Zfrt_5L162xxVcmkK73wF6GQBTK2C8fcsF9D6E1hfsdQdrlO95ABu-AtkQtF91Ap2MYa179mMJnZXvzwA2ZAXLDzQM7TsawAZab_h-DmBAO2jxEEPQfugWULeu-H0eQF-GsMa1h6D98C1g3fEHNICuxtDtBZlPP2QL6Kn-4AcwAIynH6IF9Fp_JAfQ1wWsp_yQLKAP8TfzAnopPwQL6Ff90RxADYPdmV5AH-OP8AC6nEB_wzO4gD7XH-UByKoBGGjO4AL6H3-kB7Ahgdc20vFHeQBMh182yvFHeABMZ28wwvFHdwBMR7_cyMYf2QEwXXylUY0_qgtguncLo9l-RAfAdOzWRrH9aA6A6dJtjV77kRwA05lXM2LpR3IBTDdew0iVv4jppF1hOvDaRif8EqabdoPpup0ZiewNmK7aOabLdm67g-mqXWA6a8eYrtoFppt2hemuHbriCqardo7ppF1iOm0HFs_JdNZOMVtzHZiuu6algzJdthMMdlw_pgOvquGcTMddG1MFe8R041VcflCm-66OkXb9wXTmNprOyXTi1Wx8tX5iOnUrrc7JdOV2NjhX_zEdu0m7gzJdupUN7DQ4TAe_zCrnZDp2s40qNGhMR1-2xjmZ7n25jSizUZgOv6iDczLd_JJBF9loTMeXdXhOprvXDbIEU0ahfg3T7d-b9WtGoX4N6g_MCNSvQf2BGf74i1B_cIa__iLUH5yhj78I8Qdo2OPXIf4ADXv8OsQfoCFvfxHiD9Jwt78I8QdrmNsvQfsBG970y9B-8IYyfCOk3zjD0rwFlIcaZAcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABgjD-YMjm8s1KzPiEzzsqjMbNmPk1LHToZoSxyEjJabyPMBsLjE5foDCfM11kSXoNQp1KaAk5NzhnmER4HrfDYgnlpRpLIBiQFX9SUsRFsbcqoTZZknpzRYPSmwzy-lVTT8YxLW3C7pFmqFBFF-Fa3lHZQlDY5G4klND5CZi1kVCYuRyMIOChZcl7h4XKSSVOKk7aqFCpDwmgiBPMagyUU1GYTKik3Ezc4UlqPPy_JmL3pOYtfEjYS0ZIhQ6tVMUc-n60-0ryYr5XPFzwxhzUmJ7gpgqa5bNuctiDJl-xxnTed1kZmWVTQHcuGwkmvJeyIFgwOdT4iT0do1ryBiHIiAl7OIUyY5mXGQCYjomKBWGHOnmAXQ64Q2-7hiA0ykTjvypJKMismrWaR0lJMJkvsqIGbDwdNBTWdCcmLYr7G71SLCXshQOgsXkswJ87YQlKxzeyMJk1miTifC2l0bkOSLfOyfLNzWb5XFHCJKdWsf96nj9B8eTzNi3mKSqNAqaRDZo004JSWbFEXzyqT5SICrS6cL9rNvpiD8rHZhlK8KNOKnRYFh0jKROkSW-MWpOU6IhPmqwM2ik3Hubk5PUsv88s1Ok_MmrRlJF5-Xl6wOlSuTKpIGLgcdcwkzTjZ7ryYckgppS8VCus4ERVPqRPqo2ZqvpRVpwR2p0roMKlNNl2CG-K5qLgi4FAqYjFnJmHkOfUJ0k6TccKT9pujfKUvkAiZ3AWTsZAPeZ2E11aivXyWIJnheaMpNcebCuac-Xm5nFNyl7Jxm0TkigSLfI9HGKQlHhtdksdYNqM_7FTH3TKLJ6Smw_aQ1ln9hiOaQDSfUxvNyqhemBfPFbMJnjNWkuporSGcYLmEegkt87EzplS8YGS5kiZONu41RlJZelbryfDDeZKnZQmkipDFxSUtAiKQklACXVJG-_JZ0ZyapiOCpINOK6N2GztHOTKkn2fkaQX2MBkmpDxpUaHjSRXOfIBHp4M6gSdVXbiXT_ndCrFSmhVrtVGRJVAwztMcrTnqjil48ahWkwwJU4o5hTBBsTlJldkQKSiinoCYynHFMpfTRgWS5kLQXwxlsjlutCDUEUpbliB80oyFNz-rjkcEWSXBToadJZWRmFPNc9QKrkGhiMb9VrlIm5pLEIQtZJGzZHKpPTHrCKQsVCzBFhN5c7DotIcj-ozKK2bzVKZQwK6O01Z2Jmbk0hmyGPe7bI6SzhSO84k8K-NymH18vt4R55ssoTDbntYSxSDPblHpdQp9xs2VFOb4-XlRkaAzPEIqNtJR9mxa6ZXI0jKuwkmZiny12S5V5ozJYvWZ5AmwHAYLVTILldykoZSQByVSwhQ0irU5Vt4miNl9Qf6cMGEzp5J2Dy8jShpp_7zWXMpS4YQy5Z-neP60lC8TU_wQn2WxiaMRfkIj8MzGtBmPmkO40hlz3mXJFCOhEDGn15jNbjuHniuWYo6oPCUPhOTJojBjj6Z9ViJVfQGqflZABWiT0xK3GknKEKOshMgmd2j90ahcY-QWg7aIgpUiuAq1UJrxO0wyW1I76xGJoj4Pn3Ykqt9IPhMyJg3BiF_tm7PqzHO03MSTu10-nySRVBaD2disge_iEvko36VXFa3ReU1SHnDpFW7Srs4T8tnCXClgMZhUs5m8VWhMh0therYY8aeIgDSfUJh0yoCKN58l-YSbDJEFOqx38vNuWp_0ZxS03kmbXNKwxG40cqovXMJ4XDarVLs8FKvkNkiUtrzFLdIaONyE0M7XqClaaXSoVUQgI5RziIgkwGbRwTzlSlCRkNROyxTzah2P5WWzlaQ9r7HIqaywVJjzylI5pypctJAOE8XzJaIONy-ncoR1Gm1RapeHOQHam7JRKqtBFmJFncGQMC0vJg1ZIm00JMV0wCBw57R6XtzFIvJZM6WMZhz6iJwSEwJ_KKyI-CxJriovCLr1cbmAz5IpJSa3lC93JgRmSuwTJ50adiRJy4R8DcttSFh5AYLMSl1OboKSeVglq89G8bWkiE17CiGFRZjwx4yeYpwVUHsTDqWaUMq1Ng2tyMnNZEkc1BOkgcsT5K0padJOCWfnvO5CQSGSyezmnIcqSbmeQlwXcBmz1qw7Z5f7g4SQiGZUswTpcjpKWUVwXhVP8f1EggjnhHM8uYWSaDRFJZkUaAuzQSshMSQdpchsSG1MR
f0xc8yWVIfEdNjvyfmjHJHYqggqCA6l8xs0Vvq6Hz1UfuGznyy_vv_Iuf8Wv1L-Vu6Bc4--eWVlbGzso9U_Y6_c_2b5PwV3ll_MvXn7Pzx3vHyLW1x-9Y0_K4u-_P7Kf8wIy85fTJx_8PMnKt994dsnLPGPEk_f-PDJl566kjT925YZwcmT5J3Wu4gnX9l7x_Ejd588duLj5WnLxIWb936pYnnnM5W_MXzywldP_EWl8ubz5N2-2y-UJY9XPnzw9AVasVCZij1YeewTWy78-KbTM6J7A-TbZ18iPv7arjv-7r758-onvBUr-ZHK2X89R55_21w5mfkO-Wqcc_Ifn_oJ8XP27yqP_Pk_kVLzG5WXp18lPyX9fuXlQ2-Sr_7uu5XPvfwHclz0jcpv_v1F8rmjT1eImR-S6Ud-X_acf5w8pPr1zIfd95Ayw0Pl6949Q_762X3lzxw5Q97-2_2k9K2_mnl82w_K1z2wbUH78FRl29e3Lnj-9JnysYlUxfHBXZXp7wUq255-tfKRN35K3vfGDyrPfvPH5B_2Kc-_9p090luf_yVxzPAEeXDHc6TtwhlSL_wR-dz0GZL77gly_-uPkzckv3GB-9LPKr-_8ycXJs-9VfnKC4-ee336dfLEV_zk2f97jHS-Nil94whBLqgmpX_ywH5yWn0PSf_9o2RoelL68Jf_h_jWF24jD_9s18LYl753SuTZuXA4ySqrv7Z34fZTu6VnPh0ln3z7gbLpQePCL4LfLN_1om_h8Pd_XlbemVi4hz5WufZvnySveHby_FdvfJe85qCqfM_91y489JvPliWxHQt_7Tc-85e8M6ee-VzswumrHiXdj_1q5gnzWfJr73xx5vFvP0t6P_FfM3dn7yfPPuKpqHT_W_npqQ9VDI_cVbmHOF3JH3_-wlu_1VfyL0yVr7z5fkL9-bPkU7E7yNOumZnA6T3krcldC0e_-BZxr2rXwj-L7iP2_MvOhWMv3Vr-wMc-VnnnV8fIw898euZpeZh0_dB57rYj95I3UIbz8a8rSY0hvnD9F-479cu9N174f9xC1qg -------------------------------------------------------------------------------- /DownloadRunner.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python3 2 | 3 | import os 4 | from os.path import exists 5 | import stat 6 | import http.client 7 | import json 8 | import logging 9 | from datetime import datetime 10 | import time 11 | import requests 12 | from requests.adapters import HTTPAdapter 13 | from requests.packages.urllib3.util.retry import Retry 14 | from PIL import Image 15 | import fnmatch 16 | import pandas as pd 17 | import random 18 | from config import headers_list, proxies, thread_count 19 | import argparse 20 | import asyncio 21 | import aiohttp 22 | from aiohttp import web 23 | import backoff 24 | from io import BytesIO 25 | import math 26 | 27 | try: 28 | from xml.etree import cElementTree as ET 29 | except ImportError as e: 30 | from xml.etree import ElementTree as ET 31 | 32 | 33 | class Enum(object): 34 | def __init__(self, tuplelist): 35 | self.tuplelist = tuplelist 36 | 37 | def __getattr__(self, name): 38 | return self.tuplelist.index(name) 39 | 40 | 41 | DownloadResult = Enum(('skipped', 'success', 'fallback_success', 'failure')) 42 | 43 | delay = 0 44 | 45 | # Check proxy settings, if none provided (default) set proxies to False 46 | if proxies['http'] == "http://" or proxies['https'] == "https://": 47 | proxies['http'] = None 48 | proxies['https'] = None 49 | 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument('d', help='sidewalk_server_domain - FDQN of SidewalkWebpage server to fetch pano list from, i.e. 
sidewalk-columbus.cs.washington.edu') 52 | parser.add_argument('s', help='storage_path - location to store scraped panos') 53 | parser.add_argument('-c', nargs='?', default=None, help='csv_path - location of csv from which to read pano metadata') 54 | parser.add_argument('--all-panos', action='store_true', help='Run on all panos that users visited, even if no labels were added on them.') 55 | parser.add_argument('--attempt-depth', action='store_true', help='Attempt do download depth data (we believe that this endpoint was removed in 2022 and expect depth download to always fail).') 56 | args = parser.parse_args() 57 | 58 | sidewalk_server_fqdn = args.d 59 | storage_location = args.s 60 | pano_metadata_csv = args.c 61 | all_panos = args.all_panos 62 | attempt_depth = args.attempt_depth 63 | 64 | print(sidewalk_server_fqdn) 65 | print(storage_location) 66 | print(pano_metadata_csv) 67 | print(all_panos) 68 | # sidewalk_server_fqdn = "sidewalk-columbus.cs.washington.edu" # TODO: use as defaults? 69 | # storage_location = "download_data/" # The path to where you want to store downloaded GSV panos 70 | 71 | if not os.path.exists(storage_location): 72 | os.makedirs(storage_location) 73 | 74 | print("Starting run with pano list fetched from %s and destination path %s" % (sidewalk_server_fqdn, storage_location)) 75 | 76 | 77 | def new_random_delay(): 78 | """ 79 | New random delay value generated 80 | :return: int between 50 and 250 in steps of 3 81 | """ 82 | return random.randrange(100, 200, 3) 83 | 84 | 85 | def random_header(): 86 | """ 87 | Takes the headers provided from the config file and randomly selections and returns one each time this function 88 | is called. 89 | :return: a randomly selected header file. 90 | """ 91 | headers = random.choice(headers_list) 92 | return headers 93 | 94 | 95 | # Set up the requests session for better robustness/respect of crawling 96 | # https://stackoverflow.com/questions/23013220/max-retries-exceeded-with-url-in-requests 97 | # Server errors while using proxy - https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/ 98 | def request_session(): 99 | """ 100 | Sets up a request session to be used for duration of scripts operation. 101 | :return: session 102 | """ 103 | session = requests.Session() 104 | retry = Retry(total=5, connect=5, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=1) 105 | adapter = HTTPAdapter(max_retries=retry) 106 | session.mount('http://', adapter) 107 | session.mount('https://', adapter) 108 | return session 109 | 110 | 111 | def get_response(url, session, stream=False): 112 | """ 113 | Uses requests library to get response 114 | :param url: url to visit 115 | :param session: requests session 116 | :param stream: Default False 117 | :return: response 118 | """ 119 | response = session.get(url, headers=random_header(), proxies=proxies, stream=stream) 120 | 121 | if not stream: 122 | return response 123 | else: 124 | return response.raw 125 | 126 | 127 | def progress_check(csv_pano_log_path): 128 | """ 129 | Checks download status via a csv: log as skipped if downloaded == 1, failure if download == 0. 130 | This speeds things up instead of trying to re-download broken links or images. 131 | NB: This will not check if the failure was due to internet connection being unavailable etc. so use with caution. 
132 |     :param csv_pano_log_path: path to the csv log of processed pano_ids
133 |     :return: pano_ids processed, total count of processed, count of success, count of failure
134 |     """
135 |     # temporary skip/speed up of processed panos
136 |     df_pano_id_check = pd.read_csv(csv_pano_log_path)
137 |     df_id_set = set(df_pano_id_check['gsv_pano_id'])
138 |     total_processed = len(df_pano_id_check.index)
139 |     total_success = df_pano_id_check['downloaded'].sum()
140 |     total_failed = total_processed - total_success
141 |     return df_id_set, total_processed, total_success, total_failed
142 | 
143 | 
144 | # Not currently used - data retrieved from Project Sidewalk API
145 | def extract_panowidthheight(path_to_metadata_xml):
146 |     pano = {}
147 |     pano_xml = open(path_to_metadata_xml, 'rb')
148 |     tree = ET.parse(pano_xml)
149 |     root = tree.getroot()
150 |     for child in root:
151 |         if child.tag == 'data_properties':
152 |             pano[child.tag] = child.attrib
153 | 
154 |     return int(pano['data_properties']['width']), int(pano['data_properties']['height'])
155 | 
156 | # Fallback function to get unique pano_ids in case we want to determine panoramas for scraping from a CSV.
157 | def fetch_pano_ids_csv(metadata_csv_path):
158 |     """
159 |     Loads the provided metadata csv file (downloaded from the server) as a dataframe. This replaces all the
160 |     information that would otherwise need to be gathered from Google Maps, such as image size, capture date, and coordinates.
161 |     :param metadata_csv_path: The path to the metadata csv file and the file's name, e.g. samples/metadata-seattle.csv
162 |     :return: A list of per-pano records (one dict per unique pano) containing the following metadata: gsv_panorama_id, pano_x, pano_y, zoom, label_type_id,
163 |     camera_heading, heading, pitch, label_id, width, height, tile_width, tile_height, image_date, imagery_type,
164 |     pano_lat, pano_lng, label_lat, label_lng, computation_method, copyright
165 |     """
166 |     df_meta = pd.read_csv(metadata_csv_path)
167 |     df_meta = df_meta.drop_duplicates(subset=['gsv_panorama_id']).to_dict('records')
168 |     return df_meta
169 | 
170 | 
171 | def fetch_pano_ids_from_webserver(include_all_panos):
172 |     """
173 |     Fetch panoramic image IDs from the web server.
174 | 
175 |     Args:
176 |         include_all_panos (bool): If True, output all panos, whether or not they have a label.
177 |                                   If False, output only panos with labels on them.
178 | 
179 |     Returns:
180 |         list[dict]: A list of pano metadata dicts (one per unique pano) retrieved from the server.
181 |     """
182 |     unique_ids = set()
183 |     pano_info = []
184 |     conn = http.client.HTTPSConnection(sidewalk_server_fqdn)
185 |     conn.request("GET", "/adminapi/panos")
186 |     r1 = conn.getresponse()
187 |     data = r1.read()
188 |     jsondata = json.loads(data)
189 | 
190 |     # Structure of JSON data
191 |     # [
192 |     #   {
193 |     #     "gsv_panorama_id": String,
194 |     #     "width": Int,
195 |     #     "height": Int,
196 |     #     "lat": Float,
197 |     #     "lng": Float,
198 |     #     "camera_heading": Float,
199 |     #     "camera_pitch": Float
200 |     #   },
201 |     #   ...
202 |     # ]
203 |     for value in jsondata:
204 |         pano_id = value["gsv_panorama_id"]
205 |         has_labels = value["has_labels"]
206 |         if (include_all_panos or has_labels) and pano_id not in unique_ids:
207 |             # Check if the pano_id is an empty string.
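# (In addition to empty strings, the literal ID 'tutorial' is filtered out below, since it does not refer to a real GSV pano.)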
208 | if pano_id and pano_id != 'tutorial': 209 | unique_ids.add(pano_id) 210 | pano_info.append(value) 211 | else: 212 | print("Pano ID is an empty string or is for tutorial") 213 | assert len(unique_ids) == len(pano_info) 214 | return pano_info 215 | 216 | 217 | def download_panorama_images(storage_path, pano_infos): 218 | logging.basicConfig(filename='scrape.log', level=logging.DEBUG) 219 | success_count, skipped_count, fallback_success_count, fail_count, total_completed = 0, 0, 0, 0, 0 220 | total_panos = len(pano_infos) 221 | 222 | # csv log file for pano_id failures, place in 'storage' folder (alongside pano results) 223 | csv_pano_log_path = os.path.join(storage_path, "gsv_panorama_id_log.csv") 224 | columns = ['gsv_pano_id', 'downloaded'] 225 | if not exists(csv_pano_log_path): 226 | df_pano_id_log = pd.DataFrame(columns=columns) 227 | df_pano_id_log.to_csv(csv_pano_log_path, mode='w', header=True, index=False) 228 | else: 229 | df_pano_id_log = pd.read_csv(csv_pano_log_path) 230 | processed_ids = list(df_pano_id_log['gsv_pano_id']) 231 | 232 | df_id_set, total_completed, skipped_count, fail_count = progress_check(csv_pano_log_path) 233 | 234 | for pano_info in pano_infos: 235 | pano_id = pano_info['gsv_panorama_id'] 236 | if pano_id in df_id_set: 237 | continue 238 | start_time = time.time() 239 | print("IMAGEDOWNLOAD: Processing pano %s " % (pano_id)) 240 | try: 241 | pano_dims = (pano_info['width'], pano_info['height']) 242 | result_code = download_single_pano(storage_path, pano_id, pano_dims) 243 | if result_code == DownloadResult.success: 244 | success_count += 1 245 | elif result_code == DownloadResult.fallback_success: 246 | fallback_success_count += 1 247 | elif result_code == DownloadResult.skipped: 248 | skipped_count += 1 249 | elif result_code == DownloadResult.failure: 250 | fail_count += 1 251 | downloaded = 0 if result_code == DownloadResult.failure else 1 252 | 253 | except Exception as e: 254 | fail_count += 1 255 | downloaded = 0 256 | logging.error("IMAGEDOWNLOAD: Failed to download pano %s due to error %s", pano_id, str(e)) 257 | total_completed = success_count + fallback_success_count + fail_count + skipped_count 258 | 259 | if pano_id not in processed_ids: 260 | df_data_append = pd.DataFrame([[pano_id, downloaded]], columns=columns) 261 | df_data_append.to_csv(csv_pano_log_path, mode='a', header=False, index=False) 262 | else: 263 | df_pano_id_log = pd.read_csv(csv_pano_log_path) 264 | df_pano_id_log.loc[df_pano_id_log['gsv_pano_id'] == pano_id, 'downloaded'] = downloaded 265 | df_pano_id_log.to_csv(csv_pano_log_path, mode='w', header=True, index=False) 266 | processed_ids.append(pano_id) 267 | 268 | print("IMAGEDOWNLOAD: Completed %d of %d (%d success, %d fallback success, %d failed, %d skipped)" 269 | % (total_completed, total_panos, success_count, fallback_success_count, fail_count, skipped_count)) 270 | print("--- %s seconds ---" % (time.time() - start_time)) 271 | 272 | logging.debug( 273 | "IMAGEDOWNLOAD: Final result: Completed %d of %d (%d success, %d fallback success, %d failed, %d skipped)", 274 | total_completed, 275 | total_panos, 276 | success_count, 277 | fallback_success_count, 278 | fail_count, 279 | skipped_count) 280 | 281 | return success_count, fallback_success_count, fail_count, skipped_count, total_completed 282 | 283 | 284 | def download_single_pano(storage_path, pano_id, pano_dims): 285 | base_url = 'https://maps.google.com/cbk?output=tile&cb_client=maps_sv&fover=2&onerr=3&renderer=spherical&v=4' 286 | 287 | destination_dir = 
os.path.join(storage_path, pano_id[:2]) 288 | if not os.path.isdir(destination_dir): 289 | os.makedirs(destination_dir) 290 | os.chmod(destination_dir, 0o775 | stat.S_ISGID) 291 | 292 | filename = pano_id + ".jpg" 293 | out_image_name = os.path.join(destination_dir, filename) 294 | 295 | # Skip download if image already exists 296 | if os.path.isfile(out_image_name): 297 | return DownloadResult.skipped 298 | 299 | final_image_width = int(pano_dims[0]) if pano_dims[0] is not None else None 300 | final_image_height = int(pano_dims[1]) if pano_dims[1] is not None else None 301 | zoom = None 302 | 303 | session = request_session() 304 | 305 | # Check XML metadata for image width/height max zoom if its downloaded. 306 | xml_metadata_path = os.path.join(destination_dir, pano_id + ".xml") 307 | if os.path.isfile(xml_metadata_path): 308 | print(xml_metadata_path) 309 | with open(xml_metadata_path, 'rb') as pano_xml: 310 | try: 311 | tree = ET.parse(pano_xml) 312 | root = tree.getroot() 313 | 314 | # Get the number of zoom levels. 315 | for child in root: 316 | if child.tag == 'data_properties': 317 | zoom = int(child.attrib['num_zoom_levels']) 318 | if final_image_width is None: final_image_width = int(child.attrib['width']) 319 | if final_image_height is None: final_image_height = int(child.attrib['height']) 320 | 321 | # If there is no zoom in the XML, then we skip this and try some zoom levels below. 322 | if zoom is not None: 323 | # Check if the image exists (occasionally we will have XML but no JPG). 324 | test_url = f'{base_url}&zoom={zoom}&x=0&y=0&panoid={pano_id}' 325 | test_request = get_response(test_url, session, stream=True) 326 | test_tile = Image.open(test_request) 327 | if test_tile.convert("L").getextrema() == (0, 0): 328 | return DownloadResult.failure 329 | except Exception as e: 330 | pass 331 | 332 | # If we did not find image width/height from API or XML, then set download to failure. 333 | if final_image_width is None or final_image_height is None: 334 | return DownloadResult.failure 335 | 336 | # If we did not find a zoom level in the XML above, then try a couple zoom level options here. 337 | if zoom is None: 338 | url_zoom_3 = f'{base_url}&zoom=3&x=0&y=0&panoid={pano_id}' 339 | url_zoom_5 = f'{base_url}&zoom=5&x=0&y=0&panoid={pano_id}' 340 | 341 | req_zoom_3 = get_response(url_zoom_3, session, stream=True) 342 | im_zoom_3 = Image.open(req_zoom_3) 343 | req_zoom_5 = get_response(url_zoom_5, session, stream=True) 344 | im_zoom_5 = Image.open(req_zoom_5) 345 | 346 | # In some cases (e.g., old GSV images), we don't have zoom level 5, so Google returns a 347 | # transparent image. This means we need to set the zoom level to 3. Google also returns a 348 | # transparent image if there is no imagery. So check at both zoom levels. How to check: 349 | # http://stackoverflow.com/questions/14041562/python-pil-detect-if-an-image-is-completely-black-or-white 350 | if im_zoom_5.convert("L").getextrema() != (0, 0): 351 | zoom = 5 352 | elif im_zoom_3.convert("L").getextrema() != (0, 0): 353 | zoom = 3 354 | else: 355 | # can't determine zoom 356 | return DownloadResult.failure 357 | 358 | final_im_dimension = (final_image_width, final_image_height) 359 | 360 | def generate_gsv_urls(zoom): 361 | """ 362 | Generates all valid urls of GSV tiles to be downloaded for stitching into single panorama. 
363 | :param zoom: the valid/working zoom value for this pano_id 364 | :return: a list of all valid urls to be accessed for downloading the panorama 365 | """ 366 | sites_gsv = [] 367 | for y in range(int(math.ceil(final_image_height / 512.0))): 368 | for x in range(int(math.ceil(final_image_width / 512.0))): 369 | url = f'{base_url}&zoom={zoom}&x={str(x)}&y={str(y)}&panoid={pano_id}' 370 | sites_gsv.append((str(x) + " " + str(y), url)) 371 | return sites_gsv 372 | 373 | @backoff.on_exception(backoff.expo, (aiohttp.web.HTTPServerError, aiohttp.ClientError, aiohttp.ClientResponseError, 374 | aiohttp.ServerConnectionError, aiohttp.ServerDisconnectedError, 375 | aiohttp.ClientHttpProxyError), max_tries=10) 376 | async def download_single_gsv(session, url): 377 | """ 378 | Downloads a single 512x512 panorama tile 379 | :param session: requests sessions object 380 | :param url: the url to be accessed where the target image is 381 | :return: a list containing - x and y position of the download image, downloaded image 382 | """ 383 | # TODO: possibly not needed 384 | # # If not using proxies, delay for a little bit to avoid hammering the server 385 | # if proxies["http"] is None: 386 | # time.sleep(new_random_delay() / 1000) 387 | async with session.get(url[1], proxy=proxies["http"], headers=random_header()) as response: 388 | head_content = response.headers['Content-Type'] 389 | # ensures content type is an image 390 | if head_content[0:10] != "image/jpeg": 391 | raise aiohttp.ClientResponseError(response.request_info, response.history) 392 | image = await response.content.read() 393 | return [url[0], image] 394 | 395 | @backoff.on_exception(backoff.expo, 396 | (aiohttp.web.HTTPServerError, aiohttp.ClientError, aiohttp.ClientResponseError, aiohttp.ServerConnectionError, 397 | aiohttp.ServerDisconnectedError, aiohttp.ClientHttpProxyError), max_tries=10) 398 | async def download_all_gsv_images(sites): 399 | """ 400 | For the given list of sites/urls that make up a single GSV panorama, starts the connections, breaks each of the 401 | sites into tasks, then runs these tasks through asyncio. 402 | :param sites: list of all valid urls that make up the image 403 | :return: responses from the tasks which contains all the images and their position x and y data 404 | (needed for stitching) 405 | """ 406 | conn = aiohttp.TCPConnector(limit=thread_count) 407 | async with aiohttp.ClientSession(raise_for_status=True, connector=conn) as session: 408 | tasks = [] 409 | for url in sites: 410 | task = asyncio.ensure_future(download_single_gsv(session, url)) 411 | tasks.append(task) 412 | responses = await asyncio.gather(*tasks, return_exceptions=True) 413 | return responses 414 | 415 | blank_image = Image.new('RGB', final_im_dimension, (0, 0, 0, 0)) 416 | sites = generate_gsv_urls(zoom) 417 | all_pano_images = asyncio.get_event_loop().run_until_complete(download_all_gsv_images(sites)) 418 | 419 | for cell_image in all_pano_images: 420 | img = Image.open(BytesIO(cell_image[1])) 421 | img = img.resize((512, 512)) 422 | x, y = int(str.split(cell_image[0])[0]), int(str.split(cell_image[0])[1]) 423 | blank_image.paste(img, (512 * x, 512 * y)) 424 | 425 | # TODO: sleep after entire pano downloaded versus each tile? 
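# (One option, not enabled here: reuse the new_random_delay() helper defined above to pause once per
#  stitched pano, e.g. `if proxies["http"] is None: time.sleep(new_random_delay() / 1000)`, mirroring
#  the per-tile delay that is commented out in download_single_gsv().)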
426 | 
427 |     if zoom == 3:
428 |         blank_image = blank_image.resize(final_im_dimension, Image.LANCZOS)
429 |     blank_image.save(out_image_name, 'jpeg')
430 |     os.chmod(out_image_name, 0o664)
431 |     return DownloadResult.success
432 | 
433 | 
434 | def download_panorama_metadata_xmls(storage_path, pano_infos):
435 |     '''
436 |     This method downloads an XML file that contains depth information from GSV. It first
437 |     checks if we have a folder for each pano_id, and checks if we already have the corresponding
438 |     depth file or not.
439 |     '''
440 |     total_panos = len(pano_infos)
441 |     success_count = 0
442 |     fail_count = 0
443 |     skipped_count = 0
444 |     total_completed = 0
445 | 
446 |     for pano_info in pano_infos:
447 |         pano_id = pano_info['gsv_panorama_id']
448 |         print("METADOWNLOAD: Processing pano %s " % (pano_id))
449 |         try:
450 |             result_code = download_single_metadata_xml(storage_path, pano_id)
451 |             if result_code == DownloadResult.failure:
452 |                 fail_count += 1
453 |             elif result_code == DownloadResult.success:
454 |                 success_count += 1
455 |             elif result_code == DownloadResult.skipped:
456 |                 skipped_count += 1
457 |         except Exception as e:
458 |             fail_count += 1
459 |             logging.error("METADOWNLOAD: Failed to download metadata for pano %s due to error %s", pano_id, str(e))
460 |         total_completed = fail_count + success_count + skipped_count
461 |         print("METADOWNLOAD: Completed %d of %d (%d success, %d failed, %d skipped)" %
462 |               (total_completed, total_panos, success_count, fail_count, skipped_count))
463 | 
464 |     logging.debug("METADOWNLOAD: Final result: Completed %d of %d (%d success, %d failed, %d skipped)",
465 |                   total_completed, total_panos, success_count, fail_count, skipped_count)
466 |     return (success_count, fail_count, skipped_count, total_completed)
467 | 
468 | 
469 | def download_single_metadata_xml(storage_path, pano_id):
470 |     base_url = "https://maps.google.com/cbk?output=xml&cb_client=maps_sv&hl=en&dm=1&pm=1&ph=1&renderer=cubic,spherical&v=4&panoid="
471 | 
472 |     # Check if the directory exists. Then check if the file already exists and skip if it does.
473 |     destination_folder = os.path.join(storage_path, pano_id[:2])
474 |     if not os.path.isdir(destination_folder):
475 |         os.makedirs(destination_folder)
476 |         os.chmod(destination_folder, 0o775 | stat.S_ISGID)
477 | 
478 |     filename = pano_id + ".xml"
479 |     destination_file = os.path.join(destination_folder, filename)
480 |     if os.path.isfile(destination_file):
481 |         return DownloadResult.skipped
482 | 
483 |     url = base_url + pano_id
484 | 
485 |     session = request_session()
486 |     req = get_response(url, session)
487 | 
488 |     # Check if the XML file is empty. If not, write it out to a file and set the permissions.
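# (The checks below look at the first and fifth lines of the response body: an empty first line, or a fifth line
#  matching Google's standard 'Error 404 (Not Found)!!1' page, is treated as meaning no XML/depth data is available.)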
489 | lineOne = req.content.splitlines()[0] 490 | lineFive = req.content.splitlines()[4] 491 | 492 | if lineOne == b'' or lineFive == b' Error 404 (Not Found)!!1': 493 | return DownloadResult.failure 494 | else: 495 | with open(destination_file, 'wb') as f: 496 | f.write(req.content) 497 | os.chmod(destination_file, 0o664) 498 | 499 | return DownloadResult.success 500 | 501 | 502 | def generate_depthmapfiles(path_to_scrapes): 503 | success_count = 0 504 | fail_count = 0 505 | skip_count = 0 506 | total_completed = 0 507 | # Iterate through all .xml files in specified path, recursively 508 | for root, dirnames, filenames in os.walk(path_to_scrapes): 509 | for filename in fnmatch.filter(filenames, '*.xml'): 510 | xml_location = os.path.join(root, filename) 511 | 512 | # Pano id is XML filename minus the extension 513 | pano_id = filename[:-4] 514 | print("GENERATEDEPTH: Processing pano %s " % (pano_id)) 515 | 516 | # Generate a .depth.txt file for the .xml file 517 | output_file = os.path.join(root, pano_id + ".depth.txt") 518 | if os.path.isfile(output_file): 519 | skip_count += 1 520 | else: 521 | output_code = call(["./decode_depthmap", xml_location, output_file]) 522 | if output_code == 0: 523 | os.chmod(output_file, 0o664) 524 | success_count += 1 525 | else: 526 | fail_count += 1 527 | logging.error("GENERATEDEPTH: Could not create depth.txt for pano %s, error code was %s", pano_id, 528 | str(output_code)) 529 | total_completed = fail_count + success_count + skip_count 530 | print("GENERATEDEPTH: Completed %d (%d success, %d failed, %d skipped)" % 531 | (total_completed, success_count, fail_count, skip_count)) 532 | 533 | logging.debug("GENERATEDEPTH: Final result: Completed %d (%d success, %d failed, %d skipped)", 534 | total_completed, success_count, fail_count, skip_count) 535 | return success_count, fail_count, skip_count, total_completed 536 | 537 | 538 | def run_scraper_and_log_results(pano_infos, attempt_depth): 539 | start_time = datetime.now() 540 | with open(os.path.join(storage_location, "log.csv"), 'a') as log: 541 | log.write("\n%s" % (str(start_time))) 542 | 543 | # Try to download xml (which contains the depth data) if attempt_depth=True. 544 | xml_res = () 545 | if attempt_depth: 546 | xml_res = download_panorama_metadata_xmls(storage_location, pano_infos) 547 | else: 548 | xml_res = (0, 0, len(pano_infos), len(pano_infos)) 549 | xml_end_time = datetime.now() 550 | xml_duration = int(round((xml_end_time - start_time).total_seconds() / 60.0)) 551 | with open(os.path.join(storage_location, "log.csv"), 'a') as log: 552 | log.write(",%d,%d,%d,%d,%d" % (xml_res[0], xml_res[1], xml_res[2], xml_res[3], xml_duration)) 553 | 554 | im_res = download_panorama_images(storage_location, pano_infos) 555 | im_end_time = datetime.now() 556 | im_duration = int(round((im_end_time - xml_end_time).total_seconds() / 60.0)) 557 | with open(os.path.join(storage_location, "log.csv"), 'a') as log: 558 | log.write(",%d,%d,%d,%d,%d,%d" % (im_res[0], im_res[1], im_res[2], im_res[3], im_res[4], im_duration)) 559 | 560 | # Try to extract depth data from the xml if attempt_depth=True. 
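# (generate_depthmapfiles() below walks the downloaded .xml files and shells out to the bundled ./decode_depthmap
#  binary, writing a <pano_id>.depth.txt file next to each XML it can decode.)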
561 | depth_res = () 562 | if attempt_depth: 563 | depth_res = generate_depthmapfiles(storage_location) 564 | else: 565 | depth_res = (0, 0, 0, 0) 566 | depth_end_time = datetime.now() 567 | depth_duration = int(round((depth_end_time - im_end_time).total_seconds() / 60.0)) 568 | with open(os.path.join(storage_location, "log.csv"), 'a') as log: 569 | log.write(",%d,%d,%d,%d,%d" % (depth_res[0], depth_res[1], depth_res[2], depth_res[3], depth_duration)) 570 | 571 | total_duration = int(round((depth_end_time - start_time).total_seconds() / 60.0)) 572 | with open(os.path.join(storage_location, "log.csv"), 'a') as log: 573 | log.write(",%d" % (total_duration)) 574 | 575 | 576 | # Access Project Sidewalk API to get Pano IDs for city 577 | print("Fetching pano-ids") 578 | 579 | if pano_metadata_csv is not None: 580 | pano_infos = fetch_pano_ids_csv(pano_metadata_csv) 581 | else: 582 | pano_infos = fetch_pano_ids_from_webserver(all_panos) 583 | 584 | 585 | # Uncomment this to test on a smaller subset of the pano_info 586 | # pano_infos = random.sample(pano_infos, 10) 587 | print(len(pano_infos)) 588 | # print(pano_infos) 589 | 590 | # Use pano_id list and associated info to gather panos from GSV API 591 | print("Fetching Panoramas") 592 | run_scraper_and_log_results(pano_infos, attempt_depth) 593 | --------------------------------------------------------------------------------
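For reference, the tile arithmetic that download_single_pano() relies on is easy to sanity-check in isolation: a pano of final_image_width x final_image_height pixels is fetched as ceil(width / 512) x ceil(height / 512) tiles of 512 x 512, and the tile at grid position (x, y) is pasted onto the canvas at pixel offset (512 * x, 512 * y). A minimal sketch of that arithmetic, using the example dimensions from samples/metadata-seattle.csv (16384 x 8192) rather than anything the script itself hard-codes:

import math

# Example pano dimensions taken from samples/metadata-seattle.csv.
final_image_width, final_image_height = 16384, 8192

# Same grid computation as generate_gsv_urls() in DownloadRunner.py.
tiles_x = int(math.ceil(final_image_width / 512.0))   # 32 tile columns
tiles_y = int(math.ceil(final_image_height / 512.0))  # 16 tile rows
print(tiles_x * tiles_y)                              # 512 tile requests for this pano

# The last tile of the grid is pasted at this pixel offset on the stitched canvas.
print((512 * (tiles_x - 1), 512 * (tiles_y - 1)))     # (15872, 7680)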