├── .gitattributes ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── build_server ├── docker-compose.yml.example ├── teamcity_agent │ ├── Dockerfile │ ├── requirements.txt │ └── shut_down_on_empty_queue.py ├── teamcity_logs │ └── README.md └── teamcity_server_data │ └── README.md ├── docker-compose.yml ├── efd_orders.svg ├── efd_orders.svg.png ├── files ├── archaeology │ ├── LICENSE │ ├── README.md │ └── archaeology.csv.zip ├── buildings │ └── buildings.csv.zip ├── neighborhoods │ ├── LICENSE.md │ └── neighborhoods.csv.zip └── roads │ └── boogstralen.zip ├── model ├── __init__.py ├── all_grid_search.py ├── all_test.py ├── archaeology_convnet.py ├── archaeology_convnet_fixed.py ├── archaeology_lstm.py ├── baseline │ ├── __init__.py │ ├── all_baseline_models.py │ ├── archaeo_feature_type_decision_tree.py │ ├── archaeo_feature_type_knn.py │ ├── archaeo_feature_type_logistic_regression.py │ ├── archaeo_feature_type_svm_linear.py │ ├── archaeo_feature_type_svm_polynomial.py │ ├── archaeo_feature_type_svm_rbf.py │ ├── building_type_decision_tree.py │ ├── building_type_knn.py │ ├── building_type_logistic_regression.py │ ├── building_type_svm_linear.py │ ├── building_type_svm_polynomial.py │ ├── building_type_svm_rbf.py │ ├── neighborhood_inhabintants_decision_tree.py │ ├── neighborhood_inhabintants_knn.py │ ├── neighborhood_inhabintants_logistic_regression.py │ ├── neighborhood_inhabintants_svm_linear.py │ ├── neighborhood_inhabintants_svm_polynomial.py │ └── neighborhood_inhabintants_svm_rbf.py ├── building_convnet.py ├── building_convnet_fixed.py ├── building_lstm.py ├── configs │ └── README.md ├── grid_search.py ├── neighborhood_convnet.py ├── neighborhood_convnet_fixed.py ├── neighborhood_lstm.py ├── plots │ └── README.md └── topoml_util │ ├── ConsoleLogger.py │ ├── GaussianMixtureLoss.py │ ├── GeoVectorizer.py │ ├── LoggerCallback.py │ ├── PyplotLogger.py │ ├── Tokenizer.py │ ├── __init__.py │ ├── gaussian_loss.py │ ├── geom_fourier_descriptors.py │ ├── geom_scaler.py │ ├── np_gaussian_2d_loss.py │ ├── np_gmm_loss.py │ ├── sketch_rnn_model.py │ ├── slack_send.py │ ├── test_GaussianMixtureLoss.py │ ├── test_GeoVectorizer.py │ ├── test_Tokenizer.py │ ├── test_files │ ├── big_multipolygon_wkt.txt │ ├── example.csv │ ├── gmm_output.py │ ├── multipart_multipolygon_wkt.txt │ └── polygon_multipolygon.csv │ ├── test_fourier_descriptors.py │ ├── test_geom_loss.py │ ├── test_geom_scaler.py │ ├── test_np_gaussian_2d_loss.py │ ├── test_rasterization.py │ ├── test_sketch-rnn-model.py │ ├── test_wkt2pyplot.py │ └── wkt2pyplot.py ├── prep ├── ProgressBar.py ├── densified.py ├── export-data.sh ├── get-data.sh ├── preprocess-archaeology.py ├── preprocess-buildings.py ├── preprocess-neighborhoods.py ├── spatial-join.sql ├── triangles.py ├── util │ ├── __init__.py │ ├── layerToWGS.py │ └── sim.c └── vectorize_brt_osm.py ├── requirements.txt ├── script ├── build-script.sh ├── install-docker-ubuntu.sh ├── install-requirements.sh ├── run-all-models.sh ├── slack_notify.py └── test-tensorflow.py └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.sh text eol=lf 2 | *.npz filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | 
.Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Custom stuff 104 | /.idea/ 105 | /model/tensorboard_log/ 106 | /model/plots/*.png 107 | /model/configs/*.py 108 | /script/cuda/ 109 | !/continuous_delivery/teamcity_logs/README.md 110 | !/continuous_delivery/teamcity_server_data/README.md 111 | !/continuous_delivery/docker-compose.yml 112 | !/continuous_delivery/teamcity_agent/Dockerfile 113 | !/files/roads/boogstralen.zip 114 | !/continuous_delivery/ 115 | !/files/neighborhoods/neighborhoods_order_30_train.npz 116 | !/files/neighborhoods/neighborhoods_order_30_test.npz 117 | *.csv 118 | !/model/topoml_util/test_files/example.csv 119 | !/model/topoml_util/test_files/polygon_multipolygon.csv 120 | # Prevent committing credentials set in compose file 121 | build_server/docker-compose.yml 122 | /build_server/teamcity_server_data/config/ 123 | /build_server/teamcity_server_data/plugins/ 124 | /build_server/teamcity_server_data/system/ 125 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM geodata/gdal:2.1.3 2 | 3 | RUN apt-get update && apt-get install -y curl unzip 4 | RUN pip install shapely numpy 5 | WORKDIR /data/prep -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Rein van 't Veer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /build_server/docker-compose.yml.example: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | services: 4 | teamcity: 5 | image: jetbrains/teamcity-server 6 | restart: unless-stopped 7 | volumes: 8 | - ./teamcity_logs:/opt/teamcity/logs 9 | - ./teamcity_server_data:/data/teamcity_server/datadir 10 | ports: 11 | - "8111:8111" 12 | 13 | teamcity_agent: 14 | build: teamcity_agent 15 | restart: unless-stopped 16 | depends_on: 17 | - teamcity 18 | volumes: 19 | - ./teamcity_agent:/data/teamcity_agent/conf 20 | environment: 21 | - SERVER_URL=teamcity:8111 22 | - SLACK_API_TOKEN=yourslackapitoken 23 | - SLACK_CHANNEL=#machinelearning 24 | - AWS_ACCESS_KEY_ID=yourawsaccesskeyid 25 | - AWS_SECRET_ACCESS_KEY=yourawsaccesskeytoken 26 | -------------------------------------------------------------------------------- /build_server/teamcity_agent/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jetbrains/teamcity-agent 2 | 3 | RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ 4 | NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ 5 | apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ 6 | apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ 7 | echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ 8 | echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ 9 | echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list 10 | 11 | ENV CUDA_VERSION 9.0.176 12 | 13 | ENV CUDA_PKG_VERSION 9-0=$CUDA_VERSION-1 14 | RUN apt-get update && apt-get install -y --no-install-recommends \ 15 | cuda-cudart-$CUDA_PKG_VERSION && \ 16 | ln -s cuda-9.0 /usr/local/cuda && \ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | # nvidia-docker 1.0 20 | LABEL com.nvidia.volumes.needed="nvidia_driver" 21 | LABEL com.nvidia.cuda.version="${CUDA_VERSION}" 22 | 23 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 24 | echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf 25 | 26 | # nvidia-container-runtime 27 | ENV NVIDIA_VISIBLE_DEVICES all 28 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 29 | ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0" 30 | 31 | # NCCL 32 | ENV NCCL_VERSION 2.1.15 33 | 34 | RUN apt-get update && apt-get install -y --no-install-recommends \ 35 | cuda-libraries-$CUDA_PKG_VERSION \ 36 | libnccl2=$NCCL_VERSION-1+cuda9.0 && \ 37 | rm -rf /var/lib/apt/lists/* 38 | 39 | # CUDNN 40 | #ENV CUDNN_VERSION 7.1.1.5 41 | ENV CUDNN_VERSION 7.0.3.11 42 | LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" 43 | 44 | RUN apt-get update && apt-get install -y --no-install-recommends \ 45 | 
libcudnn7=$CUDNN_VERSION-1+cuda9.0 && \ 46 | rm -rf /var/lib/apt/lists/* 47 | 48 | # Extra env vars 49 | ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/ 50 | ENV CUDA_HOME=/usr/local/cuda 51 | 52 | RUN apt-get update && apt-get install -y python3-pip 53 | RUN pip3 install --upgrade pip 54 | COPY requirements.txt /requirements.txt 55 | RUN pip3 install -r requirements.txt 56 | 57 | # Install Git Large File Storage 58 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash 59 | RUN apt-get update && apt-get install -y --no-install-recommends \ 60 | git-lfs && \ 61 | rm -rf /var/lib/apt/lists/* 62 | RUN git lfs install 63 | -------------------------------------------------------------------------------- /build_server/teamcity_agent/requirements.txt: -------------------------------------------------------------------------------- 1 | Keras>=2.1.2 2 | numpy>=1.14.0 3 | pandas>=0.22.0 4 | scikit-learn>=0.19.1 5 | scipy>=1.0.0 6 | Shapely>=1.6.3 7 | slackclient>=1.1.0 8 | tensorflow-gpu>=1.6.0 9 | matplotlib>=2.1.2 10 | pyefd>=1.0 11 | boto3>=1.6 -------------------------------------------------------------------------------- /build_server/teamcity_agent/shut_down_on_empty_queue.py: -------------------------------------------------------------------------------- 1 | import http 2 | import os 3 | from datetime import datetime 4 | 5 | import boto3 6 | import requests 7 | 8 | from slackclient import SlackClient 9 | 10 | SCRIPT_NAME = os.path.basename(__file__) 11 | TIMESTAMP = str(datetime.now()).replace(':', '.') 12 | # Set this to the appropriate region 13 | REGION_NAME = 'eu-west-1' 14 | 15 | # Get environment variables 16 | # Slack is required. We need to know if something is wrong 17 | slack_token = os.environ['SLACK_API_TOKEN'] 18 | slack_channel = os.environ['SLACK_CHANNEL'] 19 | # We are also going to require Amazon credentials, set as environment variables 20 | amazon_id = os.environ['AWS_ACCESS_KEY_ID'] 21 | amazon_key = os.environ['AWS_SECRET_ACCESS_KEY'] 22 | 23 | # Initialize frameworks 24 | ec2 = boto3.client('ec2', region_name=REGION_NAME) 25 | sc = SlackClient(slack_token) 26 | 27 | 28 | # Slack notification function 29 | def notify(signature, message): 30 | sc.api_call("chat.postMessage", channel=slack_channel, 31 | text="Script " + signature + " notification: " + str(message)) 32 | 33 | 34 | # Get build queue length 35 | queue = "http://teamcity:8111/guestAuth/app/rest/buildQueue" 36 | headers = { 37 | 'Accept': "application/json", 38 | 'Cache-Control': "no-cache", 39 | } 40 | queue_res = requests.get(queue, headers=headers) 41 | queue_status = queue_res.json() 42 | queue_length = queue_status['count'] 43 | 44 | # Get instance id for this machine 45 | # https://stackoverflow.com/questions/33301880/how-to-obtain-current-instance-id-from-boto3#33307704 46 | try: 47 | instance_metadata = requests.get('http://169.254.169.254/latest/meta-data/instance-id') 48 | except ConnectionError as e: 49 | notify(SCRIPT_NAME, 'ERROR getting instance id, cannot issue commands') 50 | raise ConnectionError(e) 51 | 52 | instance_id = instance_metadata.text 53 | 54 | if queue_length == 0: 55 | print('build server reports empty queue, shutting down.') 56 | shutdown_res = ec2.stop_instances(InstanceIds=[instance_id]) 57 | http_status_code = shutdown_res['ResponseMetadata']['HTTPStatusCode'] 58 | http_status = http.HTTPStatus(http_status_code).name 59 | 60 | if http_status_code == 200: 61 | print('Stop instances:', http_status) 62 | notify(SCRIPT_NAME, 'successful shutdown 
of {} with response {}'.format(instance_id, http_status)) 63 | else: 64 | notify(SCRIPT_NAME, 'ERROR shutting down instance id: {}'.format(http_status)) 65 | else: 66 | notify(SCRIPT_NAME, 'job finished, build server reports non-empty queue, continuing.') 67 | 68 | 69 | -------------------------------------------------------------------------------- /build_server/teamcity_logs/README.md: -------------------------------------------------------------------------------- 1 | # Log dir 2 | This is the directory the build server logs are kept. -------------------------------------------------------------------------------- /build_server/teamcity_server_data/README.md: -------------------------------------------------------------------------------- 1 | # Server data 2 | This is the directory the server data are kept. -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | postgis: 5 | image: mdillon/postgis 6 | restart: unless-stopped 7 | ports: 8 | - '5432:5432' 9 | 10 | data-prep: 11 | build: . 12 | volumes: 13 | - ./files:/data/files 14 | - ./prep:/data/prep 15 | command: bash /data/prep/get-data.sh 16 | depends_on: 17 | - postgis 18 | -------------------------------------------------------------------------------- /efd_orders.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/efd_orders.svg.png -------------------------------------------------------------------------------- /files/archaeology/LICENSE: -------------------------------------------------------------------------------- 1 | Data in this folder is copyright (c) ADC ArcheoProjecten 2 | 3 | This work is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. -------------------------------------------------------------------------------- /files/archaeology/README.md: -------------------------------------------------------------------------------- 1 | # Archaeological feature data 2 | Data in this folder is (c) ADC ArcheoProjecten, redistributed under [CC-BY-SA 4.0 license](http://creativecommons.org/licenses/by-sa/4.0/) by kind permission of ADC ArcheoProjecten. 3 | 4 | Data from the following projects was used: 5 | 6 | Project | Reference | No. of features | has definitions 7 | --- | --- | --- | --- 8 | ENKN_09 | ADC ArcheoProjecten; Roessingh, W.; Lohof, E.; (2010): Enkhuizen Kadijken 5A en 5B Opgraving. DANS. https://doi.org/10.17026/dans-27r-e5f8 | 11058 | Yes 9 | VENO13_08 | ADC ArcheoProjecten; Gerrets, D.A.; Jacobs, E.; (2011): Venlo Venlo TPN Deelgebied 1 en 2 Opgraving. DANS. https://doi.org/10.17026/dans-26f-55zu | 5101 | Yes (joined) 10 | MONF_09 | ADC ArcheoProjecten; Veken, B. van der; Prangsma, N.M.; (2011): Montferland Didam Westelijke Randweg Kerkwijk Opgraving. DANS. https://doi.org/10.17026/dans-zmk-35vy | 5603 | Yes (joined) 11 | VEEE_07 | ADC ArcheoProjecten; Dijkstra, J.; Zuidhoff, F.S.; (2011): Veere Rijksweg N57 Proefsleuven Begeleiding Opgraving. DANS. https://doi.org/10.17026/dans-xyc-re2w | 5243 | Yes 12 | GOUA_08 | ADC ArcheoProjecten; Dijkstra, J.; Houkes, M.C. ; Ostkamp, S. 
; (2010): Gouda Bolwerk Opgraving en Begeleiding. DANS. https://doi.org/10.17026/dans-xzm-x29h | 5306 | Yes 13 | VENO_02 | ADC ArcheoProjecten; Velde, H. van de; Ostkamp, S.; Veldman, H.A.P.; Wyns, S.; (2002): Venlo Maasboulevard. DANS. https://doi.org/10.17026/dans-x84-msac | 5207 | Yes 14 | KATK_08 | ADC ArcheoProjecten; Velde, H.M. van der; (2011): Katwijk Zanderij Westerbaan Opgraving. DANS. https://doi.org/10.17026/dans-znz-r2ba | 3187 | Yes (joined) 15 | WIJD_07 | Dijkstra, J. (ADC ArcheoProjecten) (2012): Wijk bij Duurstede Veilingterrein DO Opgraving. DANS. https://doi.org/10.17026/dans-x8d-qmae | 12131 | Yes (joined) 16 | OOST_10 | Roessingh, W. (ADC ArcheoProjecten); Blom, E. (ADC ArcheoProjecten) (2012): Oosterhout Vrachelen De Contreie Vrachelen 4 Opgraving. DANS. https://doi.org/10.17026/dans-25d-fpe5 | 17251 | Yes (joined) 17 | VEGL_10 | ADC ArcheoProjecten; Van der Veken, B. (ADC ArcheoProjecten); Blom, E. (ADC ArcheoProjecten) (2012): Veghel Scheiffelaar II Opgraving. DANS. https://doi.org/10.17026/dans-z93-7zbe | 4271 | Yes (joined) 18 | | | | 19 | TOTAL | | 74358 -------------------------------------------------------------------------------- /files/archaeology/archaeology.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/files/archaeology/archaeology.csv.zip -------------------------------------------------------------------------------- /files/buildings/buildings.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/files/buildings/buildings.csv.zip -------------------------------------------------------------------------------- /files/neighborhoods/LICENSE.md: -------------------------------------------------------------------------------- 1 | # License 2 | Wijken en buurten data (c) Centraal Bureau voor de Statistiek, 3 | data licensed [CC-BY](https://creativecommons.org/licenses/by/4.0/) 4 | 5 | # Source 6 | https://geodata.nationaalgeoregister.nl/wijkenbuurten2017/wfs?request=GetFeature&service=WFS&version=2.0.0&typeName=cbs_buurten_2017&outputFormat=csv&srsName=EPSG%3A4326&PropertyName=aantal_inwoners%2Cgeom 7 | -------------------------------------------------------------------------------- /files/neighborhoods/neighborhoods.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/files/neighborhoods/neighborhoods.csv.zip -------------------------------------------------------------------------------- /files/roads/boogstralen.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/files/roads/boogstralen.zip -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/model/__init__.py -------------------------------------------------------------------------------- /model/all_grid_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import sys 4 | from 
sklearn.model_selection import ParameterGrid 5 | from topoml_util.slack_send import notify 6 | 7 | SCRIPT_VERSION = '0.0.9' 8 | N_TIMES = 6 9 | 10 | HYPERPARAMS = { 11 | 'BATCH_SIZE': [512], 12 | 'REPEAT_DEEP_ARCH': [1], 13 | 'LSTM_SIZE': [64], 14 | 'DENSE_SIZE': [32], 15 | 'EPOCHS': [200], 16 | 'LEARNING_RATE': [1e-4], 17 | # 'GEOM_SCALE': [1e0, 1e-1, 1e-2, 1e-3], 18 | 'RECURRENT_DROPOUT': [0.10], 19 | # 'PATIENCE': [8, 16, 24, 32, 40], 20 | 'EARLY_STOPPING': 1 21 | } 22 | grid = list(ParameterGrid(HYPERPARAMS)) 23 | 24 | scripts = [ 25 | # 'neighborhood_inhabitants.py', 26 | # 'building_type.py', 27 | # 'archaeological_features.py' 28 | ] 29 | 30 | for configuration in grid: 31 | # Set environment variables (this allows you to do hyperparam searches from any scripting environment) 32 | for key, value in configuration.items(): 33 | os.environ[key] = str(value) 34 | 35 | # repeat to get a sense of results spread 36 | for _ in range(N_TIMES): 37 | for script in scripts: 38 | r_code = os.system('python3 {}'.format(script)) 39 | if not r_code == 0: 40 | print('{} exited with error'.format(script)) 41 | notify('{} grid search'.format(script), 'with error') 42 | sys.exit(1) 43 | 44 | notify('All grid search', 'no errors') 45 | print('All grid search', 'finished successfully') 46 | -------------------------------------------------------------------------------- /model/all_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Final test script for evaluation statistics 3 | """ 4 | 5 | import os 6 | 7 | import sys 8 | from sklearn.model_selection import ParameterGrid 9 | from topoml_util.slack_send import notify 10 | 11 | notify('ALL TEST SCRIPT RUNNING FINAL TESTS', 'STARTING') 12 | 13 | SCRIPT_VERSION = '1.0.0' 14 | N_TIMES = 1 15 | 16 | HYPERPARAMS = { # All using standard hyperparameters 17 | # 'BATCH_SIZE': [512], 18 | # 'REPEAT_DEEP_ARCH': [0], 19 | # 'LSTM_SIZE': [64], 20 | # 'DENSE_SIZE': [32], 21 | # 'EPOCHS': [200], 22 | # 'LEARNING_RATE': [1e-4], 23 | # 'GEOM_SCALE': [1e0, 1e-1, 1e-2, 1e-3], # Leave at standard normalization 24 | # 'RECURRENT_DROPOUT': [0.10], 25 | # 'PATIENCE': [8, 16, 24, 32, 40], # Early stopping disabled by default 26 | } 27 | grid = list(ParameterGrid(HYPERPARAMS)) 28 | 29 | scripts = [ 30 | # 'neighborhood_convnet.py', 31 | # 'neighborhood_lstm.py', 32 | # 'building_convnet.py', 33 | # 'building_lstm.py', 34 | 'archaeology_convnet.py', 35 | 'archaeology_lstm.py' 36 | ] 37 | 38 | for configuration in grid: 39 | # Set environment variables (this allows you to do hyperparam searches from any scripting environment) 40 | for key, value in configuration.items(): 41 | os.environ[key] = str(value) 42 | 43 | # repeat to get a sense of results spread 44 | for _ in range(N_TIMES): 45 | for script in scripts: 46 | r_code = os.system('python3 {} --test'.format(script)) 47 | if not r_code == 0: 48 | print('{} exited with error'.format(script)) 49 | notify('{} grid search'.format(script), 'with error') 50 | sys.exit(1) 51 | 52 | notify('ALL TEST', 'no errors') 53 | print('ALL TEST', 'finished successfully') 54 | -------------------------------------------------------------------------------- /model/baseline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/model/baseline/__init__.py -------------------------------------------------------------------------------- 
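The two driver scripts above (all_grid_search.py and all_test.py) hand hyperparameters to the individual model scripts purely through environment variables: each configuration produced by ParameterGrid is written into os.environ as strings before the target script is launched with os.system('python3 ...'). Below is a minimal, illustrative sketch of how a receiving script could read those values back; the variable names are taken from the HYPERPARAMS dict above, but the actual parsing code inside the model scripts is not part of this excerpt, so treat the snippet as an assumption rather than the repository's implementation.

import os

# Illustrative only: read back the hyperparameters exported by the driver
# scripts; the defaults mirror the values listed in HYPERPARAMS above.
BATCH_SIZE = int(os.getenv('BATCH_SIZE', '512'))
LSTM_SIZE = int(os.getenv('LSTM_SIZE', '64'))
LEARNING_RATE = float(os.getenv('LEARNING_RATE', '1e-4'))
RECURRENT_DROPOUT = float(os.getenv('RECURRENT_DROPOUT', '0.10'))

print('batch size', BATCH_SIZE, 'lstm size', LSTM_SIZE,
      'learning rate', LEARNING_RATE, 'recurrent dropout', RECURRENT_DROPOUT)
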
/model/baseline/all_baseline_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import sys 4 | from sklearn.model_selection import ParameterGrid 5 | from topoml_util.slack_send import notify 6 | 7 | SCRIPT_VERSION = '0.0.9' 8 | N_TIMES = 1 9 | 10 | HYPERPARAMS = { 11 | # 'BATCH_SIZE': [512], 12 | # 'REPEAT_DEEP_ARCH': [1], 13 | # 'LSTM_SIZE': [64], 14 | # 'DENSE_SIZE': [32], 15 | # 'EPOCHS': [200], 16 | # 'LEARNING_RATE': [1e-4], 17 | # 'GEOM_SCALE': [1e0, 1e-1, 1e-2, 1e-3], 18 | # 'RECURRENT_DROPOUT': [0.10], 19 | # 'PATIENCE': [8, 16, 24, 32, 40], 20 | # 'EARLY_STOPPING': 1 21 | } 22 | grid = list(ParameterGrid(HYPERPARAMS)) 23 | 24 | scripts = [ 25 | 'archaeo_feature_type_decision_tree.py', 26 | 'archaeo_feature_type_knn.py', 27 | 'archaeo_feature_type_logistic_regression.py', 28 | 'archaeo_feature_type_svm_rbf.py', 29 | 'building_type_decision_tree.py', 30 | 'building_type_knn.py', 31 | 'building_type_logistic_regression.py', 32 | 'building_type_svm_rbf.py', 33 | 'neighborhood_inhabintants_decision_tree.py', 34 | 'neighborhood_inhabintants_knn.py', 35 | 'neighborhood_inhabintants_logistic_regression.py', 36 | 'neighborhood_inhabintants_svm_rbf.py', 37 | ] 38 | 39 | for configuration in grid: 40 | # Set environment variables (this allows you to do hyperparam searches from any scripting environment) 41 | for key, value in configuration.items(): 42 | os.environ[key] = str(value) 43 | 44 | # repeat to get a sense of results spread 45 | for _ in range(N_TIMES): 46 | for script in scripts: 47 | print('Executing', script) 48 | r_code = os.system('python3 {}'.format(script)) 49 | if not r_code == 0: 50 | print('{} exited with error'.format(script)) 51 | notify('{} grid search'.format(script), 'with error') 52 | sys.exit(1) 53 | 54 | notify('All grid search', 'no errors') 55 | print('All grid search', 'finished successfully') 56 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_decision_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.tree import DecisionTreeClassifier 19 | 20 | PACKAGE_PARENT = '..' 
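# The two statements below resolve this script's absolute directory and append
# its parent (the model/ package root, since PACKAGE_PARENT is '..') to
# sys.path, so that the shared topoml_util helpers used further down, such as
# topoml_util.slack_send.notify, can be imported when the file is run directly
# from the baseline/ folder.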
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.2' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | param_grid = {'max_depth': range(5, 11)} 53 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 54 | grid = GridSearchCV( 55 | DecisionTreeClassifier(), 56 | n_jobs=NUM_CPUS, 57 | param_grid=param_grid, 58 | verbose=2, 59 | cv=cv) 60 | 61 | print('Performing grid search on model...') 62 | print('Using {} threads for grid search'.format(NUM_CPUS)) 63 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 64 | 65 | best_order = 0 66 | best_score = 0 67 | best_params = {} 68 | 69 | for order in EFD_ORDERS: 70 | print('Fitting order {} fourier descriptors'.format(order)) 71 | stop_position = 3 + (order * 8) 72 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 73 | print("The best parameters for order {} are {} with a score of {}\n".format( 74 | order, grid.best_params_, grid.best_score_)) 75 | if grid.best_score_ > best_score: 76 | best_score = grid.best_score_ 77 | best_order = order 78 | best_params = grid.best_params_ 79 | 80 | print('Training model on order {} with best parameters {}'.format( 81 | best_order, best_params)) 82 | stop_position = 3 + (best_order * 8) 83 | clf = DecisionTreeClassifier(max_depth=best_params['max_depth']) 84 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 85 | print('Cross-validation scores:', scores) 86 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = 
clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.neighbors import KNeighborsClassifier 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.5' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | k_range = np.linspace(start=21, stop=30, num=10, dtype=int) 53 | param_grid = dict(n_neighbors=k_range) 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | KNeighborsClassifier(), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 
| grid.fit(train_fourier_descriptors[::5, :stop_position], train_labels[::5]) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = KNeighborsClassifier(n_neighbors=best_params['n_neighbors']) 85 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 86 | print('Cross-validation scores:', scores) 87 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 88 | 89 | # Run predictions on unseen test data to verify generalization 90 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 91 | if not path.exists(): 92 | print("Retrieving test data from web...") 93 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 94 | 95 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 96 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 97 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 98 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 99 | 100 | print('Run on test data...') 101 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 102 | test_accuracy = accuracy_score(test_labels, predictions) 103 | 104 | runtime = time() - SCRIPT_START 105 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 106 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 107 | print(message) 108 | notify(SCRIPT_NAME, message) 109 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_logistic_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | PACKAGE_PARENT = '..' 
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | # Grid search 53 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 54 | param_grid = dict(C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | LogisticRegression(), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = LogisticRegression(C=best_params['C']) 86 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 87 | print('Cross-validation scores:', scores) 88 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 89 | 90 | # Run predictions on unseen test data to verify generalization 91 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 92 | if not path.exists(): 93 | print("Retrieving test data from web...") 94 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 95 | 96 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 97 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 98 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 99 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 100 | 101 | print('Run on test data...') 102 
| predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 103 | test_accuracy = accuracy_score(test_labels, predictions) 104 | 105 | runtime = time() - SCRIPT_START 106 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 107 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 108 | print(message) 109 | notify(SCRIPT_NAME, message) 110 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_svm_linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-1, 1e0, 1e1, 1e2, 1e3] 53 | param_grid = dict(C=C_range) 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | SVC(kernel='linear', max_iter=int(1e8)), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 | 
grid.fit(train_fourier_descriptors[::10, :stop_position], train_labels[::10]) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = SVC(kernel='linear', C=best_params['C'], max_iter=int(1e8)) 85 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 86 | 87 | # Run predictions on unseen test data to verify generalization 88 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 89 | if not path.exists(): 90 | print("Retrieving test data from web...") 91 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 92 | 93 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 94 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 95 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 96 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 97 | 98 | print('Run on test data...') 99 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 100 | test_accuracy = accuracy_score(test_labels, predictions) 101 | 102 | runtime = time() - SCRIPT_START 103 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 104 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 105 | print(message) 106 | notify(SCRIPT_NAME, message) 107 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_svm_polynomial.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 53 | degree_range = range(1, 7) 54 | param_grid = dict(degree=degree_range, C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | SVC(kernel='poly', max_iter=int(1e8)), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[::10, :stop_position], train_labels[::10]) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = SVC(kernel='poly', C=best_params['C'], degree=best_params['degree']) 86 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 
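# As in training, prediction only uses the first stop_position columns of the
# test features; stop_position = 3 + (best_order * 8), so the test matrix is
# truncated to the same elliptic Fourier descriptor order selected by the grid
# search above.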
101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_svm_rbf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-1, 1e0, 1e1, 1e2, 1e3] 53 | gamma_range = np.logspace(-4, 4, 9) 54 | param_grid = dict(gamma=gamma_range, C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | SVC(kernel='rbf'), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[::5, :stop_position], 
train_labels[::5]) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma']) 86 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/building_type_decision_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.tree import DecisionTreeClassifier 19 | 20 | PACKAGE_PARENT = '..' 
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.2' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | param_grid = {'max_depth': range(6, 13)} 53 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 54 | grid = GridSearchCV( 55 | DecisionTreeClassifier(), 56 | n_jobs=NUM_CPUS, 57 | param_grid=param_grid, 58 | verbose=2, 59 | cv=cv) 60 | 61 | print('Performing grid search on model...') 62 | print('Using {} threads for grid search'.format(NUM_CPUS)) 63 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 64 | 65 | best_order = 0 66 | best_score = 0 67 | best_params = {} 68 | 69 | for order in EFD_ORDERS: 70 | print('Fitting order {} fourier descriptors'.format(order)) 71 | stop_position = 3 + (order * 8) 72 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 73 | print("The best parameters for order {} are {} with a score of {}\n".format( 74 | order, grid.best_params_, grid.best_score_)) 75 | if grid.best_score_ > best_score: 76 | best_score = grid.best_score_ 77 | best_order = order 78 | best_params = grid.best_params_ 79 | 80 | print('Training model on order {} with best parameters {}'.format( 81 | best_order, best_params)) 82 | stop_position = 3 + (best_order * 8) 83 | clf = DecisionTreeClassifier(max_depth=best_params['max_depth']) 84 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 85 | print('Cross-validation scores:', scores) 86 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = 
clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/building_type_knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.neighbors import KNeighborsClassifier 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.5' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | k_range = np.linspace(start=21, stop=30, num=10, dtype=int) 53 | param_grid = dict(n_neighbors=k_range) 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | KNeighborsClassifier(), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 | 
grid.fit(train_fourier_descriptors[::5, :stop_position], train_labels[::5]) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = KNeighborsClassifier(n_neighbors=best_params['n_neighbors']) 85 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 86 | print('Cross-validation scores:', scores) 87 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 88 | 89 | # Run predictions on unseen test data to verify generalization 90 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 91 | if not path.exists(): 92 | print("Retrieving test data from web...") 93 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 94 | 95 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 96 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 97 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 98 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 99 | 100 | print('Run on test data...') 101 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 102 | test_accuracy = accuracy_score(test_labels, predictions) 103 | 104 | runtime = time() - SCRIPT_START 105 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 106 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 107 | print(message) 108 | notify(SCRIPT_NAME, message) 109 | -------------------------------------------------------------------------------- /model/baseline/building_type_logistic_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | PACKAGE_PARENT = '..' 
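# In the grid search below, the descriptor matrix is sliced per elliptic Fourier
# descriptor order with stop_position = 3 + (order * 8): the array appears to store
# 3 leading values followed by 8 coefficients per order. For example, order 4 keeps
# the first 3 + 4 * 8 = 35 columns of train_fourier_descriptors.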
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | # Grid search 53 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 54 | param_grid = dict(C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | LogisticRegression(), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = LogisticRegression(C=best_params['C']) 86 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 87 | print('Cross-validation scores:', scores) 88 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 89 | 90 | # Run predictions on unseen test data to verify generalization 91 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 92 | if not path.exists(): 93 | print("Retrieving test data from web...") 94 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 95 | 96 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 97 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 98 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 99 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 100 | 101 | print('Run on test data...') 102 | 
predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 103 | test_accuracy = accuracy_score(test_labels, predictions) 104 | 105 | runtime = time() - SCRIPT_START 106 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 107 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 108 | print(message) 109 | notify(SCRIPT_NAME, message) 110 | -------------------------------------------------------------------------------- /model/baseline/building_type_svm_linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-1, 1e0, 1e1, 1e2, 1e3] 53 | param_grid = dict(C=C_range) 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | SVC(kernel='linear', max_iter=int(1e7)), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 | grid.fit(train_fourier_descriptors[::20, :stop_position], 
train_labels[::20]) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = SVC(kernel='linear', C=best_params['C'], max_iter=int(1e7)) 85 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 86 | 87 | # Run predictions on unseen test data to verify generalization 88 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 89 | if not path.exists(): 90 | print("Retrieving test data from web...") 91 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 92 | 93 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 94 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 95 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 96 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 97 | 98 | print('Run on test data...') 99 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 100 | test_accuracy = accuracy_score(test_labels, predictions) 101 | 102 | runtime = time() - SCRIPT_START 103 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 104 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 105 | print(message) 106 | notify(SCRIPT_NAME, message) 107 | -------------------------------------------------------------------------------- /model/baseline/building_type_svm_polynomial.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 
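# Note that the grid search below fits on every 16th training sample
# (train_fourier_descriptors[::16]), presumably to keep the polynomial-kernel SVM
# search tractable; the final classifier is then fitted on the full training set
# with the best parameters found.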
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.4' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 53 | degree_range = range(1, 7) 54 | param_grid = dict(degree=degree_range, C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | SVC(kernel='poly', max_iter=int(1e7)), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[::16, :stop_position], train_labels[::16]) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = SVC(kernel='poly', C=best_params['C'], degree=best_params['degree']) 86 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | 
test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/building_type_svm_rbf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 53 | gamma_range = np.logspace(-2, 3, 6) 54 | param_grid = dict(gamma=gamma_range, C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | SVC(kernel='rbf', max_iter=int(1e8)), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[::10, :stop_position], train_labels[::10]) 75 
| print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma']) 86 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_decision_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.tree import DecisionTreeClassifier 20 | 21 | PACKAGE_PARENT = '..' 
22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.8' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'] 49 | 50 | scaler = StandardScaler().fit(train_fourier_descriptors) 51 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 52 | 53 | param_grid = {'max_depth': range(4, 10)} 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | DecisionTreeClassifier(), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = DecisionTreeClassifier(max_depth=best_params['max_depth']) 85 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 86 | print('Cross-validation scores:', scores) 87 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 88 | 89 | # Run predictions on unseen test data to verify generalization 90 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 91 | if not path.exists(): 92 | print("Retrieving test data from web...") 93 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 94 | 95 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 96 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 97 | test_labels = np.asarray(test_loaded['above_or_below_median'], dtype=int) 98 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 99 | 100 | print('Run on test data...') 101 | 
predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 102 | test_accuracy = accuracy_score(test_labels, predictions) 103 | 104 | runtime = time() - SCRIPT_START 105 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 106 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 107 | print(message) 108 | notify(SCRIPT_NAME, message) 109 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.neighbors import KNeighborsClassifier 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | PACKAGE_PARENT = '..' 22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.1' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | 50 | scaler = StandardScaler().fit(train_fourier_descriptors) 51 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 52 | 53 | k_range = np.linspace(start=21, stop=30, num=10, dtype=int) 54 | param_grid = dict(n_neighbors=k_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | KNeighborsClassifier(), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | 
for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = KNeighborsClassifier(n_neighbors=best_params['n_neighbors']) 86 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 87 | print('Cross-validation scores:', scores) 88 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 89 | 90 | # Run predictions on unseen test data to verify generalization 91 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 92 | if not path.exists(): 93 | print("Retrieving test data from web...") 94 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 95 | 96 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 97 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 98 | test_labels = test_loaded['above_or_below_median'][:, 0] 99 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 100 | 101 | print('Run on test data...') 102 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 103 | test_accuracy = accuracy_score(test_labels, predictions) 104 | 105 | runtime = time() - SCRIPT_START 106 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 107 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 108 | print(message) 109 | notify(SCRIPT_NAME, message) 110 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_logistic_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.linear_model import LogisticRegression 17 | from sklearn.metrics import accuracy_score 18 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | PACKAGE_PARENT = '..' 
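# The stored label array is two-dimensional; taking [:, 0] below selects its first
# column, giving the 1-D class vector that scikit-learn expects. Ten-fold
# cross-validation scores are printed before the final fit on the full training set.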
22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.0' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | 50 | scaler = StandardScaler().fit(train_fourier_descriptors) 51 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 52 | 53 | # Grid search 54 | C_range = [1e-3, 1e-2, 1e-1, 1e0, 1e1] 55 | param_grid = dict(C=C_range) 56 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 57 | grid = GridSearchCV( 58 | LogisticRegression(), 59 | n_jobs=NUM_CPUS, 60 | param_grid=param_grid, 61 | verbose=2, 62 | cv=cv) 63 | 64 | print('Performing grid search on model...') 65 | print('Using {} threads for grid search'.format(NUM_CPUS)) 66 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 67 | 68 | best_order = 0 69 | best_score = 0 70 | best_params = {} 71 | 72 | for order in EFD_ORDERS: 73 | print('Fitting order {} fourier descriptors'.format(order)) 74 | stop_position = 3 + (order * 8) 75 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 76 | print("The best parameters for order {} are {} with a score of {}\n".format( 77 | order, grid.best_params_, grid.best_score_)) 78 | if grid.best_score_ > best_score: 79 | best_score = grid.best_score_ 80 | best_order = order 81 | best_params = grid.best_params_ 82 | 83 | print('Training model on order {} with best parameters {}'.format( 84 | best_order, best_params)) 85 | stop_position = 3 + (best_order * 8) 86 | clf = LogisticRegression(C=best_params['C']) 87 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 88 | print('Cross-validation scores:', scores) 89 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 90 | 91 | # Run predictions on unseen test data to verify generalization 92 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 93 | if not path.exists(): 94 | print("Retrieving test data from web...") 95 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 96 | 97 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 98 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 99 | test_labels = np.asarray(test_loaded['above_or_below_median'][:, 0], dtype=int) 100 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 101 | 102 | 
print('Run on test data...') 103 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 104 | test_accuracy = accuracy_score(test_labels, predictions) 105 | 106 | runtime = time() - SCRIPT_START 107 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 108 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 109 | print(message) 110 | notify(SCRIPT_NAME, message) 111 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_svm_linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.svm import SVC 20 | 21 | PACKAGE_PARENT = '..' 22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.0' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | train_labels = np.reshape(train_labels, (train_labels.shape[0])) 50 | 51 | scaler = StandardScaler().fit(train_fourier_descriptors) 52 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 53 | 54 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 55 | param_grid = dict(C=C_range) 56 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 57 | grid = GridSearchCV( 58 | SVC(kernel='linear'), 59 | n_jobs=NUM_CPUS, 60 | param_grid=param_grid, 61 | verbose=2, 62 | cv=cv) 63 | 64 | print('Performing grid search on model...') 65 | print('Using {} threads for grid search'.format(NUM_CPUS)) 66 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 67 | 68 | best_order = 0 69 | 
best_score = 0 70 | best_params = {} 71 | 72 | for order in EFD_ORDERS: 73 | print('Fitting order {} fourier descriptors'.format(order)) 74 | stop_position = 3 + (order * 8) 75 | grid.fit(train_fourier_descriptors[::5, :stop_position], train_labels[::5]) 76 | print("The best parameters for order {} are {} with a score of {}\n".format( 77 | order, grid.best_params_, grid.best_score_)) 78 | if grid.best_score_ > best_score: 79 | best_score = grid.best_score_ 80 | best_order = order 81 | best_params = grid.best_params_ 82 | 83 | print('Training model on order {} with best parameters {}'.format( 84 | best_order, best_params)) 85 | stop_position = 3 + (best_order * 8) 86 | clf = SVC(kernel='linear', C=best_params['C'], max_iter=int(1e7)) 87 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 88 | 89 | # Run predictions on unseen test data to verify generalization 90 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 91 | if not path.exists(): 92 | print("Retrieving test data from web...") 93 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 94 | 95 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 96 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 97 | test_labels = test_loaded['above_or_below_median'][:, 0] 98 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 99 | test_labels = np.reshape(test_labels, (test_labels.shape[0])) 100 | 101 | print('Run on test data...') 102 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 103 | test_accuracy = accuracy_score(test_labels, predictions) 104 | 105 | runtime = time() - SCRIPT_START 106 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 107 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 108 | print(message) 109 | notify(SCRIPT_NAME, message) 110 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_svm_polynomial.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.svm import SVC 20 | 21 | PACKAGE_PARENT = '..' 
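# This variant grid-searches both the penalty C and the polynomial degree (1 to 6)
# for the poly-kernel SVC. The np.reshape call below flattens the selected label
# column to the 1-D shape expected by scikit-learn.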
22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.1' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | train_labels = np.reshape(train_labels, (train_labels.shape[0])) 50 | 51 | scaler = StandardScaler().fit(train_fourier_descriptors) 52 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 53 | 54 | C_range = [1e0, 1e1, 1e2, 1e3, 1e4, 1e5] 55 | degree_range = range(1, 7) 56 | param_grid = dict(degree=degree_range, C=C_range) 57 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 58 | grid = GridSearchCV( 59 | SVC(kernel='poly'), 60 | n_jobs=NUM_CPUS, 61 | param_grid=param_grid, 62 | verbose=2, 63 | cv=cv) 64 | 65 | print('Performing grid search on model...') 66 | print('Using {} threads for grid search'.format(NUM_CPUS)) 67 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 68 | 69 | best_order = 0 70 | best_score = 0 71 | best_params = {} 72 | 73 | for order in EFD_ORDERS: 74 | print('Fitting order {} fourier descriptors'.format(order)) 75 | stop_position = 3 + (order * 8) 76 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 77 | print("The best parameters for order {} are {} with a score of {}\n".format( 78 | order, grid.best_params_, grid.best_score_)) 79 | if grid.best_score_ > best_score: 80 | best_score = grid.best_score_ 81 | best_order = order 82 | best_params = grid.best_params_ 83 | 84 | print('Training model on order {} with best parameters {}'.format( 85 | best_order, best_params)) 86 | stop_position = 3 + (best_order * 8) 87 | clf = SVC(kernel='poly', 88 | C=best_params['C'], 89 | degree=best_params['degree']) 90 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 91 | 92 | # Run predictions on unseen test data to verify generalization 93 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 94 | if not path.exists(): 95 | print("Retrieving test data from web...") 96 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 97 | 98 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 99 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 100 | test_labels = test_loaded['above_or_below_median'][:, 0] 101 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 102 | test_labels = np.reshape(test_labels, 
(test_labels.shape[0])) 103 | 104 | print('Run on test data...') 105 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 106 | test_accuracy = accuracy_score(test_labels, predictions) 107 | 108 | runtime = time() - SCRIPT_START 109 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 110 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 111 | print(message) 112 | notify(SCRIPT_NAME, message) 113 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_svm_rbf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.svm import SVC 20 | 21 | PACKAGE_PARENT = '..' 22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.1' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | train_labels = np.reshape(train_labels, (train_labels.shape[0])) 50 | 51 | scaler = StandardScaler().fit(train_fourier_descriptors) 52 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 53 | 54 | # Grid search 55 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 56 | gamma_range = np.logspace(-3, 3, 7) 57 | param_grid = dict(gamma=gamma_range, C=C_range) 58 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 59 | grid = GridSearchCV( 60 | SVC(kernel='rbf'), 61 | n_jobs=NUM_CPUS, 62 | param_grid=param_grid, 63 | verbose=2, 64 | cv=cv) 65 | 66 | print('Performing grid search on model...') 67 | print('Using {} threads for grid search'.format(NUM_CPUS)) 68 | 
print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 69 | 70 | best_order = 0 71 | best_score = 0 72 | best_params = {} 73 | 74 | for order in EFD_ORDERS: 75 | print('Fitting order {} fourier descriptors'.format(order)) 76 | stop_position = 3 + (order * 8) 77 | grid.fit(train_fourier_descriptors[::2, :stop_position], train_labels[::2]) 78 | print("The best parameters for order {} are {} with a score of {}\n".format( 79 | order, grid.best_params_, grid.best_score_)) 80 | if grid.best_score_ > best_score: 81 | best_score = grid.best_score_ 82 | best_order = order 83 | best_params = grid.best_params_ 84 | 85 | print('Training model on order {} with best parameters {}'.format( 86 | best_order, best_params)) 87 | stop_position = 3 + (best_order * 8) 88 | clf = SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma']) 89 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 90 | 91 | # Run predictions on unseen test data to verify generalization 92 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 93 | if not path.exists(): 94 | print("Retrieving test data from web...") 95 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 96 | 97 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 98 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 99 | test_labels = test_loaded['above_or_below_median'][:, 0] 100 | test_labels = np.reshape(test_labels, (test_labels.shape[0])) 101 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 102 | 103 | print('Run on test data...') 104 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 105 | test_accuracy = accuracy_score(test_labels, predictions) 106 | 107 | runtime = time() - SCRIPT_START 108 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 109 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 110 | print(message) 111 | notify(SCRIPT_NAME, message) 112 | -------------------------------------------------------------------------------- /model/building_convnet_fixed.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 
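Hyperparameters are read from environment variables (see the hp dict below); an external driver such as model/grid_search.py can use this to sweep configurations without editing the script.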
4 | """ 5 | 6 | import os 7 | import socket 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from keras import Input 16 | from keras.callbacks import TensorBoard 17 | from keras.engine import Model 18 | from keras.layers import Dense, Conv1D, GlobalAveragePooling1D, Dropout 19 | from keras.optimizers import Adam 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.model_selection import train_test_split 22 | 23 | from topoml_util import geom_scaler 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '2.0.3' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | SIGNATURE = SCRIPT_NAME + ' ' + SCRIPT_VERSION + ' ' + TIMESTAMP 30 | DATA_FOLDER = '../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | SCRIPT_START = time() 36 | 37 | # Hyperparameters 38 | hp = { 39 | 'BATCH_SIZE': int(os.getenv('BATCH_SIZE', 32)), 40 | 'TRAIN_VALIDATE_SPLIT': float(os.getenv('TRAIN_VALIDATE_SPLIT', 0.1)), 41 | 'REPEAT_DEEP_ARCH': int(os.getenv('REPEAT_DEEP_ARCH', 0)), 42 | 'DENSE_SIZE': int(os.getenv('DENSE_SIZE', 32)), 43 | 'EPOCHS': int(os.getenv('EPOCHS', 200)), 44 | 'LEARNING_RATE': float(os.getenv('LEARNING_RATE', 1e-4)), 45 | 'DROPOUT': float(os.getenv('DROPOUT', 0.0)), 46 | 'GEOM_SCALE': float(os.getenv("GEOM_SCALE", 0)), # If no default or 0: overridden when data is known 47 | } 48 | OPTIMIZER = Adam(lr=hp['LEARNING_RATE']) 49 | 50 | # Load training data 51 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 52 | if not path.exists(): 53 | print("Retrieving training data from web...") 54 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 55 | 56 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 57 | train_geoms = train_loaded['fixed_size_geoms'] 58 | train_labels = train_loaded['building_type'] 59 | 60 | # Determine final test mode or standard 61 | if len(sys.argv) > 1 and sys.argv[1] in ['-t', '--test']: 62 | print('Training in final test mode') 63 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 64 | if not path.exists(): 65 | print("Retrieving test data from web...") 66 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 67 | 68 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 69 | test_geoms = test_loaded['fixed_size_geoms'] 70 | test_labels = test_loaded['building_type'] 71 | else: 72 | print('Training in standard training mode') 73 | # Split the training data in random seen/unseen sets 74 | train_geoms, test_geoms, train_labels, test_labels = train_test_split(train_geoms, train_labels, test_size=0.1) 75 | 76 | # Normalize 77 | geom_scale = hp['GEOM_SCALE'] or geom_scaler.scale(train_geoms) 78 | train_geoms = geom_scaler.transform(train_geoms, geom_scale) 79 | test_geoms = geom_scaler.transform(test_geoms, geom_scale) # re-use variance from training 80 | 81 | # Map types to one-hot vectors 82 | # noinspection PyUnresolvedReferences 83 | train_targets = np.zeros((len(train_labels), train_labels.max() + 1)) 84 | for index, building_type in enumerate(train_labels): 85 | train_targets[index, building_type] = 1 86 | 87 | # Shape determination 88 | geom_max_points, geom_vector_len = train_geoms.shape[1:] 89 | output_size = train_targets.shape[-1] 90 | 91 | 
# Build model 92 | inputs = Input(shape=(geom_max_points, geom_vector_len)) 93 | model = Conv1D(filters=32, kernel_size=(5,), activation='relu')(inputs) 94 | model = Conv1D(filters=48, kernel_size=(5,), activation='relu', strides=2)(model) 95 | model = Conv1D(filters=64, kernel_size=(5,), activation='relu', strides=2)(model) 96 | model = GlobalAveragePooling1D()(model) 97 | model = Dense(hp['DENSE_SIZE'], activation='relu')(model) 98 | model = Dropout(hp['DROPOUT'])(model) 99 | model = Dense(output_size, activation='softmax')(model) 100 | 101 | model = Model(inputs=inputs, outputs=model) 102 | model.compile( 103 | loss='categorical_crossentropy', 104 | metrics=['accuracy'], 105 | optimizer=OPTIMIZER), 106 | model.summary() 107 | 108 | # Callbacks 109 | callbacks = [TensorBoard(log_dir='./tensorboard_log/' + SIGNATURE, write_graph=False)] 110 | 111 | history = model.fit( 112 | x=train_geoms, 113 | y=train_targets, 114 | epochs=hp['EPOCHS'], 115 | batch_size=hp['BATCH_SIZE'], 116 | validation_split=hp['TRAIN_VALIDATE_SPLIT'], 117 | callbacks=callbacks).history 118 | 119 | # Run on unseen test data 120 | test_pred = [np.argmax(prediction) for prediction in model.predict(test_geoms)] 121 | accuracy = accuracy_score(test_labels, test_pred) 122 | 123 | runtime = time() - SCRIPT_START 124 | message = 'on {} completed with accuracy of \n{:f} \nin {} in {} epochs\n'.format( 125 | socket.gethostname(), accuracy, timedelta(seconds=runtime), len(history['val_loss'])) 126 | 127 | for key, value in sorted(hp.items()): 128 | message += '{}: {}\t'.format(key, value) 129 | 130 | notify(SIGNATURE, message) 131 | print(SCRIPT_NAME, 'finished successfully with', message) 132 | -------------------------------------------------------------------------------- /model/configs/README.md: -------------------------------------------------------------------------------- 1 | # Configurations 2 | This directory contains an archive of python test setup configurations. 
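The archived configurations are applied by exporting each hyperparameter as an environment variable before launching a model script, as `grid_search.py` does. A minimal sketch of that pattern (the configuration values below are illustrative, not taken from the archive):

```python
import os

# Hypothetical configuration; the real values live in the archived files in this directory.
configuration = {'BATCH_SIZE': 256, 'LEARNING_RATE': 1e-3, 'EPOCHS': 200}

for key, value in configuration.items():
    os.environ[key] = str(value)  # read back via os.getenv(...) in the model scripts

exit_code = os.system('python3 neighborhood_convnet_fixed.py')
```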
-------------------------------------------------------------------------------- /model/grid_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | 4 | import sys 5 | 6 | # import numpy as np 7 | from sklearn.model_selection import ParameterGrid 8 | from topoml_util.slack_send import notify 9 | 10 | SCRIPT_NAME = os.path.basename(__file__) 11 | SCRIPT_VERSION = '1.0.2' 12 | SIGNATURE = '{} {} on {}'.format(SCRIPT_NAME, SCRIPT_VERSION, socket.gethostname()) 13 | N_TIMES = 1 14 | 15 | if len(sys.argv) > 1: 16 | script_name = sys.argv[1] 17 | else: # resort to default, for 18 | # script_name = 'neighborhood_convnet.py' 19 | script_name = 'neighborhood_lstm.py' 20 | # script_name = 'building_convnet.py' 21 | # script_name = 'building_lstm.py' 22 | # script_name = 'archaeology_convnet.py' 23 | # script_name = 'archaeology_lstm.py' 24 | 25 | HYPERPARAMS = { 26 | 'BATCH_SIZE': [256], 27 | # 'REPEAT_DEEP_ARCH': [1, 2], 28 | # 'KERNEL_SIZE': np.linspace(1, 8, 8, dtype=int), 29 | # 'LSTM_SIZE': np.linspace(64, 128, 3, dtype=int), 30 | # 'DENSE_SIZE': [64], 31 | # 'EPOCHS': [200], 32 | # 'LEARNING_RATE': [8e-4, 6e-4, 4e-4, 2e-4, 1e-4], 33 | 'LEARNING_RATE': [5e-3, 1e-3], 34 | # 'LEARNING_RATE': [8e-5, 6e-5], 35 | # 'GEOM_SCALE': [1e0, 1e-1, 1e-2, 1e-3], 36 | # 'RECURRENT_DROPOUT': [0.0], 37 | # 'PATIENCE': [0, 1, 4, 8, 16, 32], 38 | # 'EARLY_STOPPING': [0], 39 | } 40 | grid = list(ParameterGrid(HYPERPARAMS)) 41 | 42 | for configuration in grid: 43 | envs = [] 44 | # Set environment variables (this allows you to do hyperparam searches from any scripting environment) 45 | for key, value in configuration.items(): 46 | os.environ[key] = str(value) 47 | 48 | # repeat to get a sense of results spread 49 | for _ in range(N_TIMES): 50 | r_code = os.system('python3 ' + script_name) 51 | if not r_code == 0: 52 | print('Grid search exited with error') 53 | notify(SIGNATURE, 'error') 54 | sys.exit(1) 55 | 56 | notify(SIGNATURE, 'success') 57 | print('Grid search {} finished successfully'.format(SIGNATURE)) 58 | -------------------------------------------------------------------------------- /model/neighborhood_convnet_fixed.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 
5 | """ 6 | 7 | import os 8 | import socket 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from keras import Input 17 | from keras.callbacks import TensorBoard 18 | from keras.engine import Model 19 | from keras.layers import Dense, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dropout 20 | from keras.optimizers import Adam 21 | from sklearn.metrics import accuracy_score 22 | from sklearn.model_selection import train_test_split 23 | 24 | from topoml_util import geom_scaler 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '2.0.5' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | SIGNATURE = SCRIPT_NAME + ' ' + SCRIPT_VERSION + ' ' + TIMESTAMP 31 | DATA_FOLDER = '../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | SCRIPT_START = time() 37 | 38 | # Hyperparameters 39 | hp = { 40 | 'BATCH_SIZE': int(os.getenv('BATCH_SIZE', 32)), 41 | 'TRAIN_VALIDATE_SPLIT': float(os.getenv('TRAIN_VALIDATE_SPLIT', 0.1)), 42 | 'REPEAT_DEEP_ARCH': int(os.getenv('REPEAT_DEEP_ARCH', 0)), 43 | 'DENSE_SIZE': int(os.getenv('DENSE_SIZE', 32)), 44 | 'EPOCHS': int(os.getenv('EPOCHS', 200)), 45 | 'LEARNING_RATE': float(os.getenv('LEARNING_RATE', 1e-3)), 46 | 'DROPOUT': float(os.getenv('DROPOUT', 0.0)), 47 | 'GEOM_SCALE': float(os.getenv("GEOM_SCALE", 0)), # If no default or 0: overridden when data is known 48 | } 49 | OPTIMIZER = Adam(lr=hp['LEARNING_RATE']) 50 | 51 | # Load training data 52 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 53 | if not path.exists(): 54 | print("Retrieving training data from web...") 55 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 56 | 57 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 58 | train_geoms = train_loaded['fixed_size_geoms'] 59 | train_labels = train_loaded['above_or_below_median'] 60 | 61 | # Determine final test mode or standard 62 | if len(sys.argv) > 1 and sys.argv[1] in ['-t', '--test']: 63 | print('Training in final test mode') 64 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 65 | if not path.exists(): 66 | print("Retrieving test data from web...") 67 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 68 | 69 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 70 | test_geoms = test_loaded['fixed_size_geoms'] 71 | test_labels = test_loaded['above_or_below_median'] 72 | else: 73 | print('Training in standard training mode') 74 | # Split the training data in random seen/unseen sets 75 | train_geoms, test_geoms, train_labels, test_labels = train_test_split(train_geoms, train_labels, test_size=0.1) 76 | 77 | # Normalize 78 | geom_scale = hp['GEOM_SCALE'] or geom_scaler.scale(train_geoms) 79 | train_geoms = geom_scaler.transform(train_geoms, geom_scale) 80 | test_geoms = geom_scaler.transform(test_geoms, geom_scale) # re-use variance from training 81 | 82 | # Map types to one-hot vectors 83 | # noinspection PyUnresolvedReferences 84 | train_targets = np.zeros((len(train_labels), train_labels.max() + 1)) 85 | for index, train_label in enumerate(train_labels): 86 | train_targets[index, train_label] = 1 87 | 88 | # Shape determination 89 | geom_vector_len = train_geoms[0].shape[1] 90 | 91 | # Build model 92 | 
inputs = Input(shape=(None, geom_vector_len)) 93 | model = Conv1D(32, (5,), activation='relu')(inputs) 94 | model = MaxPooling1D(3, padding='SAME')(model) 95 | model = Conv1D(64, (5,), activation='relu')(model) 96 | model = GlobalAveragePooling1D()(model) 97 | model = Dense(hp['DENSE_SIZE'], activation='relu')(model) 98 | model = Dropout(hp['DROPOUT'])(model) 99 | model = Dense(2, activation='softmax')(model) 100 | 101 | model = Model(inputs=inputs, outputs=model) 102 | model.compile( 103 | loss='categorical_crossentropy', 104 | metrics=['accuracy'], 105 | optimizer=OPTIMIZER), 106 | model.summary() 107 | 108 | # Callbacks 109 | callbacks = [TensorBoard(log_dir='./tensorboard_log/' + SIGNATURE, write_graph=False)] 110 | 111 | history = model.fit( 112 | x=train_geoms, 113 | y=train_targets, 114 | epochs=hp['EPOCHS'], 115 | batch_size=hp['BATCH_SIZE'], 116 | validation_split=hp['TRAIN_VALIDATE_SPLIT'], 117 | callbacks=callbacks).history 118 | 119 | # Run on unseen test data 120 | test_pred = [np.argmax(prediction) for prediction in model.predict(test_geoms)] 121 | accuracy = accuracy_score(test_labels, test_pred) 122 | 123 | runtime = time() - SCRIPT_START 124 | message = 'on {} completed with accuracy of \n{:f} \nin {} in {} epochs\n'.format( 125 | socket.gethostname(), accuracy, timedelta(seconds=runtime), len(history['val_loss'])) 126 | 127 | for key, value in sorted(hp.items()): 128 | message += '{}: {}\t'.format(key, value) 129 | 130 | notify(SIGNATURE, message) 131 | print(SCRIPT_NAME, 'finished successfully with', message) 132 | -------------------------------------------------------------------------------- /model/plots/README.md: -------------------------------------------------------------------------------- 1 | # Plots 2 | This directory is a logging directory for png-saved pyplots. 
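The images are written as timestamped .png files by the `DecypherAll` callback in `topoml_util/PyplotLogger.py`. A minimal, hypothetical sketch of attaching it to a run (model and data names are placeholders):

```python
from topoml_util.PyplotLogger import DecypherAll

callbacks = [DecypherAll(gmm_size=1, sample_size=3, plot_dir='plots')]
# The callback samples from the validation data, so a validation split is required:
# model.fit(train_vectors, train_targets, validation_split=0.1, callbacks=callbacks)
```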
-------------------------------------------------------------------------------- /model/topoml_util/ConsoleLogger.py: -------------------------------------------------------------------------------- 1 | from keras.callbacks import Callback 2 | import random 3 | from datetime import datetime 4 | import numpy as np 5 | 6 | 7 | class DecypherAll(Callback): 8 | def __init__(self, decypher): 9 | super().__init__() 10 | self.decypher = decypher 11 | 12 | def on_epoch_end(self, epoch, logs=None): 13 | random.seed(datetime.now()) 14 | sample_indexes = random.sample(range(len(self.validation_data[0])), 3) 15 | input_samples = [self.validation_data[0][sample] for sample in sample_indexes] 16 | target_samples = [self.validation_data[1][sample] for sample in sample_indexes] 17 | predictions = self.model.predict(np.array(input_samples)) 18 | 19 | print('') 20 | 21 | for (input, target, prediction) in zip(input_samples, target_samples, predictions): 22 | print('Input: %s' % self.decypher(input)) 23 | print('Target: %s' % self.decypher(target)) 24 | print('Prediction: %s\n' % self.decypher(prediction)) 25 | 26 | -------------------------------------------------------------------------------- /model/topoml_util/GaussianMixtureLoss.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | 3 | from topoml_util.GeoVectorizer import RENDER_LEN, GEOM_TYPE_LEN, ONE_HOT_LEN 4 | from topoml_util.gaussian_loss import bivariate_gaussian, univariate_gaussian 5 | 6 | 7 | class GaussianMixtureLoss: 8 | def __init__(self, num_components, num_points): 9 | self.num_points = num_points 10 | self.num_components = num_components 11 | 12 | def geom_gaussian_mixture_loss(self, y_true, y_pred): 13 | """ 14 | Calculates a loss from a rank 3 sequence, representing a self.num_components * 6 slice (the mixture components) 15 | plus one-hot encoded sequences of geometry type (8) and render/stop action type (3) 16 | :param y_true: rank 3 of shape(records, points, true_point_features >= 17) truth values tensor 17 | :param y_pred: rank 3 of shape(records, points, pred_point_features >= 17) predicted values tensor 18 | :return: a summed mixture loss and categorical cross entropy losses for the geometry type and stop bits 19 | """ 20 | # loss fn based on eq #26 of http://arxiv.org/abs/1308.0850. 
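        # Editorial note on the feature layout assumed here (see the docstring above):
        # each point carries num_components * 6 mixture parameters, followed by an 8-wide
        # one-hot geometry type and a 3-wide one-hot render/stop action, so e.g.
        # 1 component -> 6 + 8 + 3 = 17 features and 5 components -> 30 + 8 + 3 = 41.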
21 | # Reshape to one target component to be broadcasted over self.num_components 22 | true_coordinates = y_true[..., :2] 23 | # It would be nice to be able to do 24 | # shape = [*y_true.shape[:-1], 1, 2] 25 | shape = [-1, self.num_points, 1, 2] 26 | true_coordinates = K.reshape(true_coordinates, tuple(shape)) 27 | 28 | y_pred_gmm_components = y_pred[..., :-ONE_HOT_LEN] 29 | predicted_components = K.reshape( 30 | y_pred_gmm_components, 31 | # (*y_pred.shape[:-1], -1, 6)) # This would be nice 32 | (-1, self.num_points, self.num_components, 6)) 33 | 34 | pi_index = 5 # mixture component weight 35 | pi_weights = K.softmax(predicted_components[..., pi_index]) 36 | gmm = bivariate_gaussian(true_coordinates, predicted_components) * pi_weights 37 | gmm_loss = K.sum(-K.log(gmm + K.epsilon())) 38 | 39 | render_action = K.softmax(y_true[..., -RENDER_LEN:]) 40 | neg_full_stop_chance = 1 - render_action[..., 2] # 1 minus the chance of full stop 41 | gmm_loss = gmm_loss * neg_full_stop_chance 42 | 43 | geom_type_error = K.categorical_crossentropy( 44 | K.softmax(y_true[..., -(GEOM_TYPE_LEN + RENDER_LEN - 1):-RENDER_LEN]), 45 | K.softmax(y_pred[..., -(GEOM_TYPE_LEN + RENDER_LEN - 1):-RENDER_LEN])) 46 | render_error = K.categorical_crossentropy( 47 | K.softmax(y_true[..., -RENDER_LEN:]), 48 | K.softmax(y_pred[..., -RENDER_LEN:])) 49 | 50 | return gmm_loss + geom_type_error + render_error 51 | 52 | def univariate_gmm_loss(self, true, pred): 53 | """ 54 | A simple loss function for rank 3 single gaussian mixture models 55 | :param true: truth values tensor 56 | :param pred: prediction values tensor 57 | :return: loss values tensor 58 | """ 59 | if not true.shape == pred.shape: 60 | print( 61 | 'Warning: truth', true.shape, 'and prediction tensors', pred.shape, 'do not have the same shape. 
The ' 62 | 'outcome of the loss function may be unpredictable.') 63 | 64 | # true_components = K.reshape(true, (-1, self.num_components, 3)) 65 | # TODO: make reshape op rank agnostic 66 | predicted_components = K.reshape(pred, (-1, self.num_components, 3)) 67 | 68 | pi_index = 2 69 | pi_weights = K.softmax(pred[..., pi_index]) 70 | gmm = univariate_gaussian(true, predicted_components) * pi_weights 71 | gmm_loss = -K.log(K.sum(gmm + K.epsilon())) 72 | 73 | return gmm_loss 74 | -------------------------------------------------------------------------------- /model/topoml_util/LoggerCallback.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | from keras.callbacks import Callback 3 | import random 4 | from datetime import datetime 5 | import numpy as np 6 | 7 | pp = pprint.PrettyPrinter() 8 | 9 | 10 | class EpochLogger(Callback): 11 | def __init__(self, input_func=None, target_func=None, predict_func=None, aggregate_func=None, sample_size=3, 12 | stdout=False, input_slice=lambda x: x[0:1], target_slice=lambda x: x[1:2]): 13 | super().__init__() 14 | self.input_func = input_func 15 | self.target_func = target_func 16 | self.predict_func = predict_func 17 | self.aggregate_func = aggregate_func 18 | self.sample_size = sample_size 19 | self.log_to_stdout = stdout 20 | self.input_slice = input_slice 21 | self.target_slice = target_slice 22 | 23 | def on_epoch_end(self, epoch, logs=None): 24 | random.seed(datetime.now()) 25 | sample_indexes = random.sample(range(len(self.validation_data[0])), self.sample_size) 26 | inputs = np.array(self.input_slice(self.validation_data)) 27 | targets = np.array(self.target_slice(self.validation_data)) 28 | input_samples = [inputs[:, sample_index] for sample_index in sample_indexes] 29 | target_samples = [targets[:, sample_index] for sample_index in sample_indexes] 30 | 31 | predictions = [] 32 | for sample_index in sample_indexes: 33 | sample = inputs[:, sample_index:sample_index + 1] 34 | predictions.append(self.model.predict([*sample])) 35 | 36 | print('\nLogging output for %i inputs, targets and predictions...' 
% len(predictions)) 37 | 38 | for (inputs, targets, predictions) in zip(input_samples, target_samples, predictions): 39 | 40 | if self.log_to_stdout: 41 | print('Input:') 42 | pp.pprint(inputs) 43 | print('Target:') 44 | pp.pprint(targets) 45 | print('Prediction:') 46 | pp.pprint(predictions) 47 | print('') 48 | 49 | if self.aggregate_func: 50 | self.aggregate_func( 51 | (self.input_func(inputs), self.target_func(targets), self.predict_func(predictions))) 52 | -------------------------------------------------------------------------------- /model/topoml_util/PyplotLogger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | import random 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | from keras.callbacks import Callback 8 | from shapely.geometry import Point 9 | 10 | from .GeoVectorizer import GeoVectorizer 11 | from .wkt2pyplot import save_plot 12 | 13 | pp = pprint.PrettyPrinter() 14 | 15 | 16 | class DecypherAll(Callback): 17 | def __init__(self, gmm_size=1, sample_size=3, input_slice=lambda x: x[0:1], target_slice=lambda x: x[1:2], 18 | stdout=False, save_plots=True, plot_dir='plots'): 19 | """ 20 | Class constructor that instantiates with a few vital settings in order to decypher the output 21 | :type target_slice: object 22 | :param gmm_size: size as an integer of the gaussian mixture model 23 | :param sample_size: size as an integer of the number of samples to log 24 | :param stdout: boolean whether or not to log to stdout. Mixture models can have a lot of output. 25 | :param plot_dir: string of a directory to save plots to, relative to the path called to execute the script 26 | """ 27 | super().__init__() 28 | self.gmm_size = gmm_size 29 | self.sample_size = sample_size 30 | self.input_slice = input_slice 31 | self.target_slice = target_slice 32 | self.stdout = stdout 33 | self.save_plots = save_plots 34 | 35 | os.makedirs(plot_dir, exist_ok=True) 36 | self.plot_dir = plot_dir 37 | 38 | def on_epoch_end(self, epoch, logs=None): 39 | """ 40 | Epochal logging function that outputs to a pyplot saved to a timestamped .png file 41 | :param epoch: automatically instantiated by Keras 42 | :param logs: automatically instantiated by Keras 43 | """ 44 | random.seed(datetime.now()) 45 | 46 | sample_indexes = random.sample(range(len(self.validation_data[0])), self.sample_size) 47 | inputs = np.array(self.input_slice(self.validation_data)) 48 | targets = np.array(self.target_slice(self.validation_data)) 49 | input_samples = [inputs[:, sample_index] for sample_index in sample_indexes] 50 | target_samples = [targets[:, sample_index] for sample_index in sample_indexes] 51 | 52 | predictions = [] 53 | for sample_index in sample_indexes: 54 | sample = inputs[:, sample_index:sample_index + 1] 55 | predictions.append(self.model.predict([*sample])) 56 | 57 | print('\nPlotting output for %i inputs, targets and predictions...' 
% len(predictions)) 58 | 59 | for (input_vectors, target_vectors, prediction_vectors) in zip(input_samples, target_samples, predictions): 60 | timestamp = str(datetime.now()).replace(':', '.') 61 | 62 | if self.stdout: 63 | print('Input:') 64 | pp.pprint(input_vectors) 65 | print('Target:') 66 | pp.pprint(target_vectors) 67 | print('Prediction:') 68 | pp.pprint(prediction_vectors) 69 | 70 | if self.save_plots: 71 | input_polys = [GeoVectorizer.decypher(poly) for poly in input_vectors] 72 | target_polys = [GeoVectorizer.decypher(target_vectors[0])] 73 | prediction_points = [ 74 | Point(point).wkt for point in 75 | GeoVectorizer(gmm_size=self.gmm_size).decypher_gmm_geom(prediction_vectors[0], 500) 76 | ] 77 | 78 | geoms = input_polys, target_polys, prediction_points 79 | save_plot(geoms, self.plot_dir, timestamp) 80 | -------------------------------------------------------------------------------- /model/topoml_util/Tokenizer.py: -------------------------------------------------------------------------------- 1 | from keras.preprocessing.text import Tokenizer 2 | import numpy as np 3 | 4 | 5 | class Tokenize(Tokenizer): 6 | """Text tokenization wrapper around Keras text tokenization methods 7 | """ 8 | 9 | def __init__(self, texts): 10 | super().__init__(num_words=None, 11 | filters='\t\n', 12 | lower=True, 13 | split="", 14 | char_level=True) 15 | self.fit_on_texts(texts) 16 | 17 | @staticmethod 18 | def truncate(max_len, untruncated_training_set, untruncated_target_set): 19 | """ 20 | Method for truncating the training and target set to fit the maximum 21 | sequence length, batch and validation set size 22 | :param max_len: maximum length of characters per sequence/sentence 23 | :param untruncated_training_set: untruncated list of input sequences 24 | :param untruncated_target_set: untruncated list of target output sequences 25 | :return: training_set, target_set: a tuple of truncated training and target sets 26 | """ 27 | training_set = [] 28 | target_set = [] 29 | 30 | # Restrict input to be of less or equal length as the maximum length. 31 | for index, record in enumerate(untruncated_training_set): 32 | if len(record) <= max_len: 33 | training_set.append(record) 34 | target_set.append(untruncated_target_set[index]) 35 | 36 | return training_set, target_set 37 | 38 | @staticmethod 39 | def batch_truncate(batch_size, max_len, validation_split, untruncated_training_set, untruncated_target_set): 40 | """ 41 | Method for truncating the training and target set to fit the maximum 42 | sequence length, batch and validation set size 43 | :param batch_size: size of the epoch batch size 44 | :param max_len: maximum length of characters per sequence/sentence 45 | :param validation_split: ratio of the training/validation split 46 | :param untruncated_training_set: untruncated list of input sequences 47 | :param untruncated_target_set: untruncated list of target output sequences 48 | :return: training_set, target_set: a tuple of truncated training and target sets 49 | """ 50 | training_set = [] 51 | target_set = [] 52 | 53 | # Restrict input to be of less or equal length as the maximum length. 
54 | for index, record in enumerate(untruncated_training_set): 55 | if len(record) <= max_len: 56 | training_set.append(record) 57 | target_set.append(untruncated_target_set[index]) 58 | 59 | # Truncate the array to the batch size, accounting for the validation set 60 | # The validation sample size must be a multiple of the batch size 61 | # Say the truncated length is 27,000 and the split ratio is 0.1, the validation sample size is 2700 62 | validation_size = int(len(training_set) * validation_split) 63 | # We need to get it down to 2000 64 | validation_size = validation_size - validation_size % batch_size 65 | # The truncated length must be a multiple of the validation sample size 66 | truncated_size = len(training_set) - len(training_set) % int(validation_size / validation_split) 67 | training_set = training_set[0:truncated_size] 68 | target_set = target_set[0:truncated_size] 69 | return training_set, target_set 70 | 71 | @staticmethod 72 | def max_sample(predictions): 73 | # helper function to sample an index from a probability array 74 | return np.argmax(predictions) 75 | 76 | def char_level_tokenize(self, texts): 77 | sequences = self.texts_to_sequences(texts) 78 | return sequences 79 | 80 | def decypher(self, sequences): 81 | """ 82 | Decyphers a encoded 3D array of one-hot vectors back to a 2D array of sentences 83 | :param sequences: 84 | :return: 85 | """ 86 | # sampled = [Tokenize.max_sample(token) for token in prediction] 87 | # sequence.append(sampled) 88 | inv_cipher = {v: k for k, v in self.word_index.items()} 89 | decyphered = [] 90 | for sequence in sequences: 91 | decyphered_sequence = [] 92 | for num in sequence: 93 | if num in inv_cipher: 94 | decyphered_sequence.append(inv_cipher[num]) 95 | else: 96 | decyphered_sequence.append(' ') 97 | decyphered.append(''.join([char for char in decyphered_sequence])) 98 | return decyphered 99 | 100 | def one_hot(self, input_sequences, maxlen): 101 | # The third dimension of the matrix is equal to the length of the word index plus one: 102 | # There is no '0' index in the word index. 103 | x = np.zeros((len(input_sequences), maxlen, len(self.word_index) + 1), dtype=np.bool) 104 | for i, sentence in enumerate(input_sequences): 105 | for t, char in enumerate(sentence): 106 | x[i, t, self.word_index[char]] = True 107 | return x 108 | 109 | -------------------------------------------------------------------------------- /model/topoml_util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/model/topoml_util/__init__.py -------------------------------------------------------------------------------- /model/topoml_util/gaussian_loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras import backend as K 3 | from keras.backend import epsilon 4 | from keras.losses import mse, categorical_crossentropy 5 | 6 | from .GeoVectorizer import GEOM_TYPE_INDEX, RENDER_INDEX 7 | 8 | 9 | def geom_gaussian_loss(y_true, y_pred): 10 | # loss fn based on eq #26 of http://arxiv.org/abs/1308.0850. 
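    # Editorial note: the returned loss is the sum of three terms computed below: a bivariate
    # gaussian negative log likelihood over the coordinate slice, plus categorical
    # cross-entropies over the one-hot geometry-type and render/stop slices, whose offsets
    # are marked by GEOM_TYPE_INDEX and RENDER_INDEX.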
11 | gaussian_loss = bivariate_gaussian_loss(y_true, y_pred) 12 | geom_type_error = categorical_crossentropy(K.softmax(y_true[..., GEOM_TYPE_INDEX:RENDER_INDEX]), 13 | K.softmax(y_pred[..., GEOM_TYPE_INDEX:RENDER_INDEX])) 14 | render_error = categorical_crossentropy(K.softmax(y_true[..., RENDER_INDEX:]), 15 | K.softmax(y_pred[..., RENDER_INDEX:])) 16 | return gaussian_loss + geom_type_error + render_error 17 | 18 | 19 | # Adapted to Keras from https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L268 20 | # Adapted version of the probability density function of 21 | # https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case 22 | def bivariate_gaussian(true, pred): 23 | """ 24 | Stabilized rank-agnostic bivariate gaussian probability function (pdf) 25 | Returns results of eq # 24 of http://arxiv.org/abs/1308.0850 26 | :param true: truth values with at least [mu1, mu2] 27 | :param pred: values predicted with at least [mu1, mu2, sigma1, sigma2, rho] 28 | :return: probability density function 29 | """ 30 | x_coord = true[..., 0] 31 | y_coord = true[..., 1] 32 | mu_x = pred[..., 0] 33 | mu_y = pred[..., 1] 34 | # exponentiate the sigmas and also make correlative rho between -1 and 1. 35 | # eq. # 21 and 22 of http://arxiv.org/abs/1308.0850 36 | # analogous to https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L326 37 | sigma_x = K.exp(K.abs(pred[..., 2])) + epsilon() 38 | sigma_y = K.exp(K.abs(pred[..., 3])) + epsilon() 39 | rho = K.tanh(pred[..., 4]) * 0 # avoid drifting to -1 or 1 to prevent NaN 40 | norm1 = K.log(1 + K.abs(x_coord - mu_x)) 41 | norm2 = K.log(1 + K.abs(y_coord - mu_y)) 42 | variance_x = K.square(sigma_x) 43 | variance_y = K.square(sigma_y) 44 | s1s2 = sigma_x * sigma_y # very large if sigma_x and/or sigma_y are very large 45 | # eq 25 of http://arxiv.org/abs/1308.0850 46 | z = ((K.square(norm1) / variance_x) + 47 | (K.square(norm2) / variance_y) - 48 | (2 * rho * norm1 * norm2 / s1s2)) # z → -∞ if rho * norm1 * norm2 → ∞ and/or s1s2 → 0 49 | neg_rho = 1 - K.square(rho) # → 0 if rho → {1, -1} 50 | numerator = K.exp(-z / (2 * neg_rho)) # → ∞ if z → -∞ and/or neg_rho → 0 51 | denominator = (2 * np.pi * s1s2 * K.sqrt(neg_rho)) # → 0 if s1s2 → 0 and/or neg_rho → 0 52 | pdf = numerator / denominator # → ∞ if denominator → 0 and/or if numerator → ∞ 53 | return pdf 54 | 55 | 56 | # Adapted to Keras from https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L268 57 | # Adapted version of the probability density function of 58 | # https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case 59 | # augmented to negative log likelihood loss configuration 60 | def bivariate_gaussian_loss(true, pred): 61 | """ 62 | Bivariate gaussian loss function 63 | Returns results of eq # 24 of http://arxiv.org/abs/1308.0850 64 | :param true: truth values with at least [mu1, mu2] 65 | :param pred: values predicted with at least [mu1, mu2, sigma1, sigma2, rho] 66 | :return: the log of the summed max likelihood 67 | """ 68 | pdf = bivariate_gaussian(true, pred) 69 | return K.sum(-K.log(pdf + epsilon())) # → -∞ if pdf → ∞ 70 | 71 | 72 | def univariate_gaussian(true, pred): 73 | """ 74 | Generic, rank-agnostic bivariate gaussian function 75 | Returns results of eq # 24 of http://arxiv.org/abs/1308.0850 76 | :param true: truth values with at least [mu] 77 | :param pred: values predicted with at least [mu, sigma] 78 | :return: probability density function 79 | """ 80 | x = 
true[..., 0] 81 | mu = pred[..., 0] 82 | sigma = pred[..., 1] 83 | 84 | norm = K.log(1 + K.abs(x - mu)) # needs log of norm to counter large mu diffs 85 | variance = K.softplus(K.square(sigma)) 86 | z = K.exp(-K.square(K.abs(norm)) / (2 * variance) + epsilon()) # z -> 0 if sigma 87 | # pdf -> 0 if sigma is very large or z -> 0; NaN if variance -> 0 88 | pdf = z / K.sqrt((2 * np.pi * variance) + epsilon()) 89 | return pdf 90 | 91 | 92 | def univariate_gaussian_loss(true, pred): 93 | pdf = univariate_gaussian(true, pred) # pdf -> 0 if sigma is very large or z -> 0 94 | return -K.log(pdf + epsilon()) # inf if pdf -> 0 95 | -------------------------------------------------------------------------------- /model/topoml_util/geom_fourier_descriptors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyefd import elliptic_fourier_descriptors 3 | 4 | np.seterr(all='raise') 5 | 6 | 7 | def geom_fourier_descriptors(shapes, order): 8 | """ 9 | Creates a stacked array of different variations of fourier descriptors: normalized, non-normalized 10 | :param shapes: a list of shapely shapes 11 | :param order: the fourier descriptor order (the size of the returned array along the 0-axis) 12 | :return: a 2d array with shape ((order * 2) + 3, 4) 13 | """ 14 | fourier_descriptors = [] 15 | for index, shape in enumerate(shapes): 16 | coeffs = create_geom_fourier_descriptor(shape, order) 17 | fourier_descriptors.append(coeffs) 18 | 19 | return fourier_descriptors 20 | 21 | 22 | def create_geom_fourier_descriptor(shape, order): 23 | boundary = shape.boundary 24 | while boundary.geom_type == "MultiLineString": 25 | boundary = boundary.geoms[0] 26 | # Set normalize to false to retain size information. 27 | non_normalized_coeffs = elliptic_fourier_descriptors( 28 | boundary.coords, order=order, normalize=False) 29 | # normalized Fouriers 30 | normalized_coeffs = elliptic_fourier_descriptors( 31 | boundary.coords, order=order, normalize=True) 32 | 33 | # TODO: create centroid distance fourier descriptors 34 | # See https://doi-org.vu-nl.idm.oclc.org/10.1016/j.image.2009.04.001 35 | # coords = np.array(boundary.coords) 36 | # centroid_distances = [boundary.centroid.distance(Point(point)) for point in coords] 37 | # centroid_fourier_descriptors = elliptic_fourier_descriptors(centroid_distances, normalize=True) 38 | 39 | # Stack 'em all 40 | coeffs = [shape.area, boundary.length, len(boundary.coords)] 41 | for nn, n in zip(non_normalized_coeffs, normalized_coeffs): 42 | coeffs = np.append(coeffs, nn) # without axis this will just create an array 43 | coeffs = np.append(coeffs, n) 44 | 45 | return coeffs 46 | -------------------------------------------------------------------------------- /model/topoml_util/geom_scaler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .GeoVectorizer import FULL_STOP_INDEX 3 | 4 | 5 | def scale(vectors): 6 | means = localized_mean(vectors) 7 | min_maxs = [] 8 | 9 | for index, data_point in enumerate(vectors): 10 | full_stop_point = data_point[:, FULL_STOP_INDEX].tolist() 11 | 12 | try: 13 | full_stop_point_index = full_stop_point.index(1) 14 | except Exception as e: # if a dummy point is encountered 15 | min_maxs.append([0, 0]) 16 | continue 17 | 18 | min_maxs.append([ 19 | np.min(data_point[..., :full_stop_point_index, :2] - means[index]), 20 | np.max(data_point[..., :full_stop_point_index, :2] - means[index]) 21 | ]) 22 | 23 | return np.std(min_maxs) 24 | 
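# Editorial usage sketch, mirroring the convnet scripts above: fit the scale on the training
# set only and re-use it for the test set, e.g.
#   scale_factor = scale(train_geoms)
#   train_geoms = transform(train_geoms, scale_factor)
#   test_geoms = transform(test_geoms, scale_factor)  # re-use variance from training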
25 | 26 | def transform(vectors, scale=None): 27 | localized = np.copy(vectors) 28 | means = localized_mean(vectors) 29 | 30 | for index, data_point in enumerate(localized): 31 | full_stop_point = data_point[:, FULL_STOP_INDEX].tolist() 32 | 33 | try: 34 | full_stop_point_index = full_stop_point.index(1) 35 | except Exception as e: # if a dummy point is encountered 36 | continue 37 | 38 | data_point[..., :full_stop_point_index + 1, :2] -= means[index] 39 | data_point[..., :full_stop_point_index + 1, :2] /= scale 40 | 41 | return localized 42 | 43 | 44 | def localized_mean(vectors): 45 | geom_means = [] 46 | for data_point in vectors: 47 | full_stop_point = data_point[:, FULL_STOP_INDEX].tolist() 48 | 49 | try: 50 | full_stop_point_index = full_stop_point.index(1) 51 | except Exception as e: # if a dummy point is encountered 52 | geom_means.append([[[0, 0]]]) 53 | continue 54 | 55 | # Take the mean of all non-null points for localized origin 56 | geom_mean = np.mean(data_point[0:full_stop_point_index, 0:2], axis=0, keepdims=True) 57 | geom_means.append(geom_mean) 58 | 59 | return np.array(geom_means) 60 | -------------------------------------------------------------------------------- /model/topoml_util/np_gaussian_2d_loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | epsilon = 1e-8 4 | 5 | 6 | def softplus(x): 7 | return np.logaddexp(1.0, x) 8 | 9 | 10 | def softmax(x): 11 | """Compute softmax values for each sets of scores in x.""" 12 | e_x = np.exp(x - np.max(x)) 13 | return e_x / e_x.sum() 14 | 15 | # Adapted version of the probability density function of 16 | # https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case 17 | # augmented to negative log likelihood loss configuration 18 | def np_r2_bivariate_gaussian_loss(true, pred): 19 | """Returns results of eq # 24 of http://arxiv.org/abs/1308.0850""" 20 | x_coord = true[:, 0] 21 | y_coord = true[:, 1] 22 | mu_x = pred[:, 0] 23 | mu_y = pred[:, 1] 24 | 25 | # exponentiate the sigmas and also make correlative rho between -1 and 1. 26 | # eq. 
# 21 and 22 of http://arxiv.org/abs/1308.0850 27 | # analogous to https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L326 28 | sigma_x = np.exp(np.abs(pred[:, 2])) 29 | sigma_y = np.exp(np.abs(pred[:, 3])) 30 | rho = np.tanh(pred[:, 4]) # hardcode to avoid drifting to -1 or 1 31 | 32 | norm1 = np.log(1 + np.abs(x_coord - mu_x)) 33 | norm2 = np.log(1 + np.abs(y_coord - mu_y)) 34 | 35 | variance_x = softplus(np.square(sigma_x)) 36 | variance_y = softplus(np.square(sigma_y)) 37 | s1s2 = softplus(sigma_x * sigma_y) # very large if sigma_x and/or sigma_y are very large 38 | 39 | # eq 25 of http://arxiv.org/abs/1308.0850 40 | z = ((np.square(norm1) / variance_x) + 41 | (np.square(norm2) / variance_y) - 42 | (2 * rho * norm1 * norm2 / s1s2)) # z → -∞ if rho * norm1 * norm2 → ∞ and/or s1s2 → 0 43 | neg_rho = 1 - np.square(rho) # → 0 if rho → {1, -1} 44 | numerator = np.exp(-z / (2 * neg_rho)) # → ∞ if z → -∞ and/or neg_rho → 0 45 | denominator = (2 * np.pi * s1s2 * np.sqrt(neg_rho)) + epsilon # → 0 if s1s2 → 0 and/or neg_rho → 0 46 | pdf = numerator / denominator # → ∞ if denominator → 0 and/or if numerator → ∞ 47 | return -np.log(pdf + epsilon) # → -∞ if pdf → ∞ 48 | 49 | 50 | # Adapted version of the probability density function of 51 | # https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case 52 | # augmented to negative log likelihood loss configuration 53 | def np_r4_bivariate_gaussian_loss(true, pred): 54 | pdf = np_r4_bivariate_gaussian(true, pred) 55 | return -np.log(pdf + epsilon) # → -∞ if pdf → ∞ 56 | 57 | 58 | def np_r4_bivariate_gaussian(true, pred): 59 | """Returns results of eq # 24 of http://arxiv.org/abs/1308.0850""" 60 | x_coord = true[:, :, :, 0] 61 | y_coord = true[:, :, :, 1] 62 | mu_x = pred[:, :, :, 0] 63 | mu_y = pred[:, :, :, 1] 64 | # exponentiate the sigmas and also make correlative rho between -1 and 1. 65 | # eq. 
# 21 and 22 of http://arxiv.org/abs/1308.0850 66 | # analogous to https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L326 67 | sigma_x = np.exp(np.abs(pred[:, :, :, 2])) 68 | sigma_y = np.exp(np.abs(pred[:, :, :, 3])) 69 | rho = np.tanh(pred[:, :, :, 4]) * 0.1 # hardcode to avoid drifting to -1 or 1 70 | 71 | norm1 = np.log(1 + np.abs(x_coord - mu_x)) 72 | norm2 = np.log(1 + np.abs(y_coord - mu_y)) 73 | 74 | variance_x = softplus(np.square(sigma_x)) 75 | variance_y = softplus(np.square(sigma_y)) 76 | s1s2 = softplus(sigma_x * sigma_y) # very large if sigma_x and/or sigma_y are very large 77 | # eq 25 of http://arxiv.org/abs/1308.0850 78 | z = ((np.square(norm1) / variance_x) + 79 | (np.square(norm2) / variance_y) - 80 | (2 * rho * norm1 * norm2 / s1s2)) # z → -∞ if rho * norm1 * norm2 → ∞ and/or s1s2 → 0 81 | neg_rho = 1 - np.square(rho) # → 0 if rho → {1, -1} 82 | numerator = np.exp(-z / (2 * neg_rho)) # → ∞ if z → -∞ and/or neg_rho → 0 83 | denominator = (2 * np.pi * s1s2 * np.sqrt(neg_rho)) + epsilon # → 0 if s1s2 → 0 and/or neg_rho → 0 84 | pdf = numerator / denominator # → ∞ if denominator → 0 and/or if numerator → ∞ 85 | return pdf 86 | -------------------------------------------------------------------------------- /model/topoml_util/np_gmm_loss.py: -------------------------------------------------------------------------------- 1 | from topoml_util.gaussian_loss import r4_bivariate_gaussian 2 | import numpy as np 3 | 4 | class GaussianMixtureLoss: 5 | def __init__(self, num_components): 6 | self.num_components = num_components 7 | 8 | @staticmethod 9 | def softmax(x): 10 | """Compute softmax values for each sets of scores in x.""" 11 | return np.exp(x) / np.sum(np.exp(x), axis=0) 12 | 13 | @staticmethod 14 | def epsilon(): 15 | return 1e-16 16 | 17 | def geom_gaussian_mixture_loss(self, y_true, y_pred): 18 | # loss fn based on eq #26 of http://arxiv.org/abs/1308.0850. 19 | (data_points, points, features) = y_pred.shape 20 | geom_type_index = 6 * self.num_components # Calculate offset from parameters times components 21 | render_index = geom_type_index + 8 22 | pi_index = 5 23 | 24 | predicted_components = np.reshape(y_pred[:geom_type_index], (-1, points.value, self.num_components, 6)) 25 | pi = self.softmax(predicted_components[:, :, :, pi_index]) 26 | 27 | true_components = np.reshape(y_true[:geom_type_index], (-1, points.value, self.num_components, 6)) 28 | 29 | gmm = r4_bivariate_gaussian(true_components, predicted_components) * pi 30 | gmm_loss = np.sum(-np.log(gmm + self.epsilon())) 31 | 32 | # Zero out loss terms beyond N_s, the last actual stroke 33 | render = 1 - np.mean(y_pred[:, :, render_index:render_index + 2]) # RENDER and STOP values 34 | 35 | gmm_loss = gmm_loss * render 36 | 37 | return gmm_loss 38 | 39 | 40 | -------------------------------------------------------------------------------- /model/topoml_util/sketch_rnn_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Sketch-RNN Model.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | from keras import backend as K 20 | 21 | import random 22 | 23 | # internal imports 24 | 25 | import numpy as np 26 | import tensorflow as tf 27 | 28 | 29 | # NB: the below are inner functions, not methods of Model 30 | def tf_2d_normal(x1, x2, mu1, mu2, s1, s2, rho): 31 | """Returns result of eq # 24 of http://arxiv.org/abs/1308.0850.""" 32 | # exponentiate the sigmas and also make corr between -1 and 1. 33 | print_op = tf.Print() 34 | s1 = tf.exp(s1) 35 | s2 = tf.exp(s2) 36 | rho = tf.tanh(rho) 37 | 38 | norm1 = tf.subtract(x1, mu1) 39 | norm2 = tf.subtract(x2, mu2) 40 | s1s2 = tf.multiply(s1, s2) 41 | # eq 25 42 | z = (tf.square(tf.div(norm1, s1)) + tf.square(tf.div(norm2, s2)) - 43 | 2 * tf.div(tf.multiply(rho, tf.multiply(norm1, norm2)), s1s2)) 44 | neg_rho = 1 - tf.square(rho) 45 | result = tf.exp(tf.div(-z, 2 * neg_rho)) 46 | denom = 2 * np.pi * tf.multiply(s1s2, tf.sqrt(neg_rho)) 47 | result = tf.div(result, denom) 48 | return result 49 | 50 | 51 | def get_lossfunc(z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, 52 | z_pen_logits, x1_data, x2_data, pen_data): 53 | """Returns a loss fn based on eq #26 of http://arxiv.org/abs/1308.0850.""" 54 | # This represents the L_R only (i.e. does not include the KL loss term). 55 | 56 | result0 = tf_2d_normal(x1_data, x2_data, z_mu1, z_mu2, z_sigma1, z_sigma2, 57 | z_corr) 58 | epsilon = 1e-6 59 | # result1 is the loss wrt pen offset (L_s in equation 9 of 60 | # https://arxiv.org/pdf/1704.03477.pdf) 61 | result1 = tf.multiply(result0, z_pi) 62 | result1 = tf.reduce_sum(result1, 1, keep_dims=True) 63 | result1 = -tf.log(result1 + epsilon) # avoid log(0) 64 | 65 | fs = 1.0 - pen_data[:, 2] # use training data for this 66 | fs = tf.reshape(fs, [-1, 1]) 67 | # Zero out loss terms beyond N_s, the last actual stroke 68 | result1 = tf.multiply(result1, fs) 69 | 70 | # result2: loss wrt pen state, (L_p in equation 9) 71 | result2 = tf.nn.softmax_cross_entropy_with_logits( 72 | labels=pen_data, logits=z_pen_logits) 73 | result2 = tf.reshape(result2, [-1, 1]) 74 | if not self.hps.is_training: # eval mode, mask eos columns 75 | result2 = tf.multiply(result2, fs) 76 | 77 | result = result1 + result2 78 | return result 79 | 80 | 81 | # below is where we need to do MDN (Mixture Density Network) splitting of 82 | # distribution params 83 | def get_mixture_coef(output): 84 | """Returns the tf slices containing mdn dist params.""" 85 | # This uses eqns 18 -> 23 of http://arxiv.org/abs/1308.0850. 86 | z = output 87 | z_pen_logits = z[:, 0:3] # pen states 88 | z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr = tf.split(z[:, 3:], 6, 1) 89 | 90 | # process output z's into MDN paramters 91 | 92 | # softmax all the pi's and pen states: 93 | z_pi = tf.nn.softmax(z_pi) 94 | z_pen = tf.nn.softmax(z_pen_logits) 95 | 96 | # exponentiate the sigmas and also make corr between -1 and 1. 
97 | z_sigma1 = tf.exp(z_sigma1) 98 | z_sigma2 = tf.exp(z_sigma2) 99 | z_corr = tf.tanh(z_corr) 100 | 101 | r = [z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_pen, z_pen_logits] 102 | return r 103 | -------------------------------------------------------------------------------- /model/topoml_util/slack_send.py: -------------------------------------------------------------------------------- 1 | import os 2 | from slackclient import SlackClient 3 | 4 | slack_token = os.environ.get("SLACK_API_TOKEN") 5 | 6 | 7 | def notify(signature, message): 8 | if slack_token: 9 | sc = SlackClient(slack_token) 10 | sc.api_call( 11 | "chat.postMessage", 12 | channel="#machinelearning", 13 | text="Session \n" + signature + "\ncompleted with: " + str(message)) 14 | else: 15 | print('No slack notification: no slack API token environment variable "SLACK_API_TOKEN" set.') 16 | -------------------------------------------------------------------------------- /model/topoml_util/test_GaussianMixtureLoss.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | from topoml_util.test_files import gmm_output 6 | from topoml_util.GaussianMixtureLoss import GaussianMixtureLoss 7 | 8 | sess = tf.InteractiveSession() 9 | DATA_FILE = '../files/geodata_vectorized.npz' 10 | 11 | 12 | class TestGaussianMixtureLoss(unittest.TestCase): 13 | def test_bivariate_gaussian_loss(self): 14 | true = np.array([gmm_output.target]) 15 | pred = np.array([gmm_output.prediction]) 16 | loss = GaussianMixtureLoss(num_components=5, num_points=14).geom_gaussian_mixture_loss(true, pred) 17 | print(loss.eval()) 18 | 19 | def test_single_gaussian_loss(self): 20 | true = np.array([ 21 | [1., 1., 0.], 22 | [1., 1., 0.], 23 | [1., 1., 0.], 24 | [1., 1., 0.], 25 | ]) 26 | pred1 = np.array([ 27 | [1., 1., 0.], 28 | [1., 1., 0.], 29 | [1., 1., 0.], 30 | [1., 1., 0.], 31 | ]) 32 | pred2 = np.array([ 33 | [0., 0., 0.], 34 | [0., 0., 0.], 35 | [0., 0., 0.], 36 | [0., 0., 0.], 37 | ]) 38 | loss1 = GaussianMixtureLoss(num_components=1, num_points=1).univariate_gmm_loss(true, pred1) 39 | loss2 = GaussianMixtureLoss(num_components=1, num_points=1).univariate_gmm_loss(true, pred2) 40 | self.assertLess(loss1.eval(), loss2.eval()) 41 | 42 | -------------------------------------------------------------------------------- /model/topoml_util/test_GeoVectorizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas 5 | from GeoVectorizer import GeoVectorizer, GEO_VECTOR_LEN 6 | from shapely import wkt as wktreader 7 | 8 | TOPOLOGY_CSV = 'test_files/polygon_multipolygon.csv' 9 | SOURCE_DATA = pandas.read_csv(TOPOLOGY_CSV) 10 | brt_wkt = SOURCE_DATA['brt_wkt'] 11 | osm_wkt = SOURCE_DATA['osm_wkt'] 12 | target_wkt = SOURCE_DATA['intersection_wkt'] 13 | 14 | input_geom = np.array([ 15 | [0., 0., 1., 0., 0.], 16 | [0., 1., 1., 0., 0.], 17 | [1., 1., 1., 0., 0.], 18 | [1., 0., 1., 0., 0.], 19 | [0., 0., 0., 1., 0.], 20 | [0., 0., 1., 0., 0.], 21 | [0., -1., 1., 0., 0.], 22 | [-1., -1., 1., 0., 0.], 23 | [-1., 0., 1., 0., 0.], 24 | [0., 0., 0., 0., 1.], 25 | [0., 0., 0., 0., 0.] 
26 | ]) 27 | 28 | output_geom = np.array([ 29 | [0.0, 0.00, 1., 0., 0.], 30 | [0.0, 0.25, 1., 0., 0.], 31 | [0.0, 0.50, 1., 0., 0.], 32 | [0.0, 0.75, 1., 0., 0.], 33 | [0.0, 1.00, 1., 0., 0.], 34 | [0.25, 1.0, 1., 0., 0.], 35 | [0.50, 1.0, 1., 0., 0.], 36 | [1.0, 1.00, 1., 0., 0.], 37 | [1.0, 0.50, 1., 0., 0.], 38 | [1.0, 0.00, 1., 0., 0.], 39 | [0.5, 0.00, 1., 0., 0.], 40 | [0.0, 0.00, 0., 1., 0.], 41 | [0.0, 0.00, 1., 0., 0.], 42 | [0.0, -0.5, 1., 0., 0.], 43 | [0.0, -1.0, 1., 0., 0.], 44 | [-0.5, -1., 1., 0., 0.], 45 | [-1., -1.0, 1., 0., 0.], 46 | [-1., -0.5, 1., 0., 0.], 47 | [-1., 0.00, 1., 0., 0.], 48 | [-0.5, 0.0, 1., 0., 0.], 49 | [0.00, 0.0, 0., 0., 1.], 50 | [0.00, 0.0, 0., 0., 0.] 51 | ]) 52 | 53 | non_empty_geom_collection = 'GEOMETRYCOLLECTION(LINESTRING(1 1, 3 5),POLYGON((-1 -1, -1 -5, -5 -5, -5 -1, -1 -1)))' 54 | 55 | 56 | class TestVectorizer(unittest.TestCase): 57 | def test_max_points(self): 58 | max_points = GeoVectorizer.max_points(brt_wkt, osm_wkt) 59 | self.assertEqual(max_points, 159) 60 | 61 | # def test_interpolate(self): 62 | # interpolated = GeoVectorizer.interpolate(input_geom, len(input_geom) * 2) 63 | # for index, _ in enumerate(interpolated): 64 | # result = list(interpolated[index]) 65 | # expected = list(output_geom[index]) 66 | # self.assertListEqual(result, expected, msg='Lists differ at index %i' % index) 67 | 68 | def test_vectorize_one_wkt(self): 69 | max_points = 20 70 | input_set = SOURCE_DATA['intersection_wkt'] 71 | vectorized = [] 72 | for index in range(len(input_set)): 73 | vectorized.append(GeoVectorizer.vectorize_wkt(input_set[index], max_points, simplify=True)) 74 | self.assertEqual(len(input_set), len(brt_wkt)) 75 | self.assertEqual(vectorized[0].shape, (19, GEO_VECTOR_LEN)) 76 | self.assertEqual(vectorized[1].shape, (1, GEO_VECTOR_LEN)) 77 | 78 | def test_fixed_size(self): 79 | max_points = 20 80 | input_set = SOURCE_DATA['intersection_wkt'] 81 | vectorized = [GeoVectorizer.vectorize_wkt(wkt, max_points, simplify=True, fixed_size=True) for wkt in input_set] 82 | self.assertEqual(np.array(vectorized).shape, (input_set.size, 20, GEO_VECTOR_LEN)) 83 | 84 | def test_non_empty_geom_coll(self): 85 | with self.assertRaises(ValueError): 86 | GeoVectorizer.vectorize_wkt(non_empty_geom_collection, 100) 87 | 88 | def test_point(self): 89 | point_matrix = GeoVectorizer.vectorize_wkt('POINT(12 14)', 5) 90 | self.assertEqual(point_matrix.shape, (1, GEO_VECTOR_LEN)) 91 | 92 | def test_unsupported_geom(self): 93 | # Since 94 | with self.assertRaises(Exception): 95 | GeoVectorizer.vectorize_wkt( 96 | 'TEST_FOR_UNKNOWN_GEOM_TYPE ((10 10, 20 20, 10 40),(40 40, 30 30, 40 20, 30 10))', 16) 97 | 98 | def test_vectorize_big_multipolygon(self): 99 | with open('test_files/big_multipolygon_wkt.txt', 'r') as file: 100 | wkt = file.read() 101 | max_points = GeoVectorizer.max_points([wkt]) 102 | vectorized = GeoVectorizer.vectorize_wkt(wkt, max_points) 103 | self.assertEqual((144, GEO_VECTOR_LEN), vectorized.shape) 104 | 105 | def test_simplify_multipolygon_gt_max_points(self): 106 | with open('test_files/multipart_multipolygon_wkt.txt', 'r') as file: 107 | wkt = file.read() 108 | max_points = 20 109 | vectorized = GeoVectorizer.vectorize_wkt(wkt, max_points, simplify=True) 110 | self.assertEqual((20, GEO_VECTOR_LEN), vectorized.shape) 111 | 112 | def test_multipolygon_exceed_max_points(self): 113 | with open('test_files/multipart_multipolygon_wkt.txt', 'r') as file: 114 | wkt = file.read() 115 | max_points = 20 116 | with self.assertRaises(Exception): 117 | 
GeoVectorizer.vectorize_wkt(wkt, max_points) 118 | 119 | def test_polygon_exceed_max_points(self): 120 | with open('test_files/multipart_multipolygon_wkt.txt', 'r') as file: 121 | wkt = file.read() 122 | shape = wktreader.loads(wkt) 123 | geom = shape.geoms[0] 124 | max_points = 20 125 | with self.assertRaises(Exception): 126 | GeoVectorizer.vectorize_wkt(geom.wkt, max_points) 127 | -------------------------------------------------------------------------------- /model/topoml_util/test_Tokenizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | from Tokenizer import Tokenize 4 | 5 | TOPOLOGY_TRAINING_CSV = 'test_files/example.csv' 6 | source_data = pandas.read_csv(TOPOLOGY_TRAINING_CSV) 7 | raw_training_set = source_data['brt_wkt'] + ' ' + source_data['osm_wkt'] 8 | raw_target_set = source_data['intersection_wkt'] 9 | 10 | 11 | class TestUtil(unittest.TestCase): 12 | def test_truncate(self): 13 | max_len = 500 14 | (input_set, _) = Tokenize.truncate(max_len, raw_training_set, raw_target_set) 15 | for record in input_set: 16 | for field in record: 17 | self.assertLessEqual(len(field), max_len) 18 | 19 | def test_batch_truncate(self): 20 | batch_size = 3 21 | max_len = 1000 22 | validation_split = 0.1 23 | training_set, target_set = Tokenize.batch_truncate(batch_size, max_len, validation_split, raw_training_set, 24 | raw_target_set) 25 | self.assertEqual(len(training_set), 30) 26 | 27 | def test_tokenize(self): 28 | test_strings = ['A test string'] 29 | tokenizer = Tokenize(test_strings) 30 | tokenized = tokenizer.char_level_tokenize(test_strings) 31 | self.assertEqual((tokenizer.word_index, tokenized), 32 | ({' ': 2, 'A': 4, 'e': 5, 'g': 9, 'i': 7, 'n': 8, 'r': 6, 's': 3, 't': 1}, 33 | [[4, 2, 1, 5, 3, 1, 2, 3, 1, 6, 7, 8, 9]])) 34 | 35 | def test_tokenize_example(self): 36 | self.maxDiff = None 37 | test_strings = source_data.as_matrix() 38 | word_index = {'5': 1, '4': 2, '.': 3, '1': 4, '2': 5, '8': 6, ' ': 7, ',': 8, '3': 9, '6': 10, '0': 11, 39 | '9': 12, '7': 13, 'O': 14, '(': 15, ')': 16, 'L': 17, 'Y': 18, 'P': 19, 'G': 20, 'N': 21, 40 | 'T': 22, 'E': 23, 'M': 24, 'I': 25, 'C': 26, 'U': 27, 'R': 28} 41 | tokenizer = Tokenize(test_strings[0] + test_strings[1] + test_strings[2]) 42 | tokenized = tokenizer.char_level_tokenize(test_strings[0]) 43 | self.assertEqual((tokenizer.word_index, tokenized[0][0:15]), 44 | (word_index, 45 | [19, 14, 17, 18, 20, 14, 21, 15, 15, 2, 3, 6, 4, 4, 6])) 46 | 47 | def test_one_hot(self): 48 | source_matrix = source_data.as_matrix() 49 | test_strings = source_matrix[0] + source_matrix[1] 50 | 51 | max_len = 0 52 | for sentence in test_strings: 53 | if len(sentence) > max_len: 54 | max_len = len(sentence) 55 | 56 | tokenizer = Tokenize(test_strings) 57 | matrix = tokenizer.one_hot(test_strings, max_len) 58 | self.assertEqual(matrix[0][0][19], True) # 'P' for POLYGON 59 | 60 | def test_detokenize(self): 61 | test_strings = ['A test string'] 62 | tokenizer = Tokenize(test_strings) 63 | tokenized = tokenizer.char_level_tokenize(test_strings) 64 | detokenized = tokenizer.decypher(tokenized) 65 | self.assertEqual(detokenized, test_strings) 66 | -------------------------------------------------------------------------------- /model/topoml_util/test_files/big_multipolygon_wkt.txt: -------------------------------------------------------------------------------- 1 | MULTIPOLYGON (((6.83347875187002 53.319132848582356, 6.833999853819664 53.31918330312409, 6.836043494585205 53.31930920917599, 
6.835895913178991 53.31813487667021, 6.835980841423087 53.31761993697426, 6.836337152462576 53.31719832819632, 6.837172560351007 53.31682710709162, 6.839040096236629 53.31657989801909, 6.841303534858739 53.31641267969816, 6.843051364598726 53.315732694765565, 6.843804351387401 53.31543051198378, 6.843844040555951 53.31496554356866, 6.849161839342751 53.31495092942536, 6.854083876463372 53.31678939207867, 6.855817302106883 53.31761756753287, 6.850097590717466 53.31124090458927, 6.856680386462843 53.3104102095955, 6.856912492582837 53.31122852049775, 6.859123448118157 53.31125088626337, 6.861203354792467 53.31152026628038, 6.86171860496607 53.311319047409356, 6.863699715806039 53.31102400147218, 6.868928270350355 53.31065634847005, 6.870198211839971 53.30892311646559, 6.856851778148959 53.306587127303814, 6.834839966102368 53.302824411833214, 6.831951384722409 53.30253790911038, 6.827197021362261 53.30176091777146, 6.823401207719804 53.30062293103662, 6.8149978182745246 53.2977805535316, 6.814368920545618 53.297587512118206, 6.814355036282553 53.297579899925616, 6.814344609167823 53.297591710031135, 6.814167366642931 53.29777480453906, 6.814165945015529 53.297776277130794, 6.814156763728136 53.297785759416136, 6.8132054150148535 53.29876810478963, 6.812798235162585 53.299188555730275, 6.8114504868753 53.30058019919939, 6.810796554979605 53.30125536048411, 6.8100540057226056 53.30202197233889, 6.808020590522993 53.3041211487334, 6.807369373578128 53.304791782942885, 6.8074642014179165 53.30482400231268, 6.809360787743384 53.3054683699806, 6.8094433474374805 53.30548788179985, 6.809507351512904 53.30549521230928, 6.809561812550777 53.30549977944671, 6.809633709214066 53.30550530900296, 6.809881554858264 53.30547343849502, 6.809936166023247 53.30547800368545, 6.809968511855471 53.305485259570084, 6.8100214353329465 53.305502336099075, 6.81010782501163 53.30553788833553, 6.810170686045728 53.3055696750993, 6.810242128379014 53.30561142506304, 6.810336248676477 53.30570844357485, 6.810399054046888 53.30578399518662, 6.810520660444627 53.305909174570495, 6.810625372616963 53.305990700250284, 6.8106842287552265 53.30602837564755, 6.810757613771548 53.30605626270446, 6.810840881580534 53.306079000749335, 6.810955806184349 53.30610172291632, 6.811057894566426 53.306117407970795, 6.811492049300044 53.30619808385792, 6.811603756545034 53.30621877682113, 6.809514757555597 53.30895295506413, 6.809417978884093 53.309084764676555, 6.808907354913822 53.30977980216965, 6.808352734605847 53.3104333906923, 6.80827109565832 53.31040110596566, 6.80779479031262 53.31021262930991, 6.8076086001684395 53.31012658269649, 6.80658157663805 53.311034401770506, 6.806528647701738 53.31108112768483, 6.807344330519582 53.31140057658718, 6.808051642865825 53.31167762711978, 6.808064449528665 53.311697425463, 6.808056273864496 53.311713428402804, 6.80745021347699 53.31224603265588, 6.807323362419456 53.312367259332134, 6.807027905044086 53.31264957413987, 6.806965634405601 53.31271797782896, 6.806910322822276 53.31276985438571, 6.806796452836691 53.31286160363022, 6.806611786750899 53.31301808332564, 6.806449445197397 53.31315533716344, 6.806273163757848 53.313352335840335, 6.8061145136413055 53.313487838146735, 6.805917675122718 53.31362567786664, 6.80580331189864 53.313716263642355, 6.805494112983976 53.31398285776417, 6.805107140671515 53.31433493177683, 6.80479416442644 53.31461927164204, 6.804440870955177 53.314937156267426, 6.803874223055891 53.31542515300257, 6.803685238349022 53.31560293172313, 6.803652993117111 
53.315629507152174, 6.803599976096668 53.315678389334735, 6.803524239948995 53.31574829797531, 6.803338105146352 53.31592015611583, 6.803499314648956 53.31596705293939, 6.803527881389022 53.31597636095488, 6.803642509617262 53.31601371265091, 6.803771479418676 53.3160704253172, 6.80391455866432 53.316131823477136, 6.804051480927207 53.31619760793609, 6.804202494474142 53.316281199053954, 6.804207839380495 53.31628859496464, 6.804568764582166 53.31650810246428, 6.804806604068407 53.31665357416869, 6.804896149856973 53.316706975941216, 6.8049633807597 53.316743476412405, 6.805088161704932 53.31680050640781, 6.805242275838586 53.31685961670394, 6.805550628475197 53.316958783406996, 6.805626899179788 53.31698268549428, 6.805725540059064 53.31700686321823, 6.805827736940912 53.317029460370534, 6.807760988463772 53.31669925345747, 6.808857000386165 53.31667225343516, 6.809526380893851 53.316805307837264, 6.810005620768624 53.317002954508865, 6.810811943353177 53.31742700406379, 6.811731841739554 53.31769693190602, 6.812809105837255 53.31769089841459, 6.813935828099859 53.31822963677398, 6.815413044903393 53.31846820665727, 6.816475138225733 53.31859093880479, 6.817382611586303 53.318350427544644, 6.8184281201242944 53.31829136328779, 6.819391014094212 53.31827259252478, 6.820452966469373 53.31865870909369, 6.821619054574962 53.31897448879506, 6.822971223362212 53.31920210221392, 6.82416316749997 53.31920920031828, 6.825290045360304 53.31929738314048, 6.827008187082374 53.31960594368717, 6.828086428006082 53.3196205421661, 6.829063536762218 53.31949312285018, 6.831350888042208 53.31922920403511, 6.832784801743416 53.31906761895827, 6.83347875187002 53.319132848582356))) -------------------------------------------------------------------------------- /model/topoml_util/test_fourier_descriptors.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from pyefd import elliptic_fourier_descriptors 4 | 5 | 6 | class TestFourierDescriptors(unittest.TestCase): 7 | def test_same_descriptors(self): 8 | square1 = [[0, 0], [1, 0], [1, 0.5], [1, 1], [0, 1], [0, 0]] 9 | square2 = [[0, 0], [0.5, 0], [1, 0], [1, 1], [0, 1], [0, 0]] 10 | descriptors1 = elliptic_fourier_descriptors(square1) 11 | descriptors2 = elliptic_fourier_descriptors(square2) 12 | np.testing.assert_array_almost_equal(descriptors1, descriptors2) 13 | 14 | def test_different_descriptors(self): 15 | square1 = [[0, 0], [1, 0], [1, 0.5], [1, 1], [0, 1], [0, 0]] 16 | square2 = [[0, 0], [0.5, 0], [1, 0], [200, 300], [0, 1], [0, 0]] 17 | descriptors1 = elliptic_fourier_descriptors(square1) 18 | descriptors2 = elliptic_fourier_descriptors(square2) 19 | coeffs = np.append(descriptors1, descriptors2, axis=0) 20 | try: 21 | np.testing.assert_array_almost_equal(descriptors1, descriptors2) 22 | except Exception as e: 23 | self.assertEqual('Arrays are not almost equal to 6 decimals', e.args[0][1:42]) 24 | 25 | def test_normalized_descriptors(self): 26 | square1 = [[0, 0], [1, 0], [1, 0.5], [1, 1], [0, 1], [0, 0]] 27 | descriptors1 = elliptic_fourier_descriptors(square1, normalize=True) 28 | descriptors2 = elliptic_fourier_descriptors(square1) 29 | try: 30 | np.testing.assert_array_almost_equal(descriptors1, descriptors2) 31 | except Exception as e: 32 | self.assertEqual('Arrays are not almost equal to 6 decimals', e.args[0][1:42]) 33 | -------------------------------------------------------------------------------- /model/topoml_util/test_geom_scaler.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from topoml_util import geom_scaler as gs 6 | 7 | # noinspection PyUnresolvedReferences 8 | dummy_geom = np.zeros((1, 1, 5)) 9 | 10 | square = np.array([[ 11 | [0., 0., 1., 0., 0.], 12 | [1., 0., 1., 0., 0.], 13 | [1., 1., 1., 0., 0.], 14 | [0., 1., 1., 0., 0.], 15 | [0., 0., 0., 0., 1.], 16 | ]]) 17 | 18 | square_duplicate_nodes = np.array([[ 19 | [0., 0., 1., 0., 0.], 20 | [1., 0., 1., 0., 0.], 21 | [1., 1., 1., 0., 0.], 22 | [1., 1., 1., 0., 0.], 23 | [1., 1., 1., 0., 0.], 24 | [1., 1., 1., 0., 0.], 25 | [1., 1., 1., 0., 0.], 26 | [0., 1., 1., 0., 0.], 27 | [0., 0., 0., 0., 1.], 28 | ]]) 29 | 30 | rectangle = np.array([[ 31 | [0., 0., 1., 0., 0.], 32 | [1., 0., 1., 0., 0.], 33 | [1., 2., 1., 0., 0.], 34 | [0., 2., 1., 0., 0.], 35 | [0., 0., 0., 0., 1.], 36 | ]]) 37 | 38 | normalized_square = np.array([[ 39 | [-1., -1., 1., 0., 0.], 40 | [ 1., -1., 1., 0., 0.], 41 | [ 1., 1., 1., 0., 0.], 42 | [-1., 1., 1., 0., 0.], 43 | [-1., -1., 0., 0., 1.], 44 | ]]) 45 | 46 | 47 | class TestGeomScaler(unittest.TestCase): 48 | def test_localized_mean(self): 49 | means = gs.localized_mean(square) 50 | for mean in means[0]: 51 | self.assertTrue((mean == 0.5).all()) 52 | 53 | def test_localized_mean_rectangle(self): 54 | means = gs.localized_mean(rectangle) 55 | self.assertEqual(means[0, 0, 0], 0.5) 56 | self.assertEqual(means[0, 0, 1], 1) 57 | 58 | def test_localized_mean_dup_nodes(self): 59 | means = gs.localized_mean(square_duplicate_nodes) 60 | self.assertTrue((means == 0.75).all()) 61 | 62 | def test_scaling_square(self): 63 | scale = gs.scale(square) 64 | self.assertEqual(scale, 0.5) 65 | 66 | def test_scaling_square_dup_nodes(self): 67 | scale = gs.scale(square_duplicate_nodes) 68 | self.assertEqual(scale, 0.5) 69 | 70 | def test_transform(self): 71 | # scaled_square = square[0] * 2 72 | # scaled_square[4, 12] = 1. 73 | scale = gs.scale(square) 74 | n_square = gs.transform(square, scale=scale) 75 | self.assertTrue((n_square == normalized_square).all()) 76 | coords = [geom[:, :2].flatten() for geom in n_square] 77 | coords = [item for sublist in coords for item in sublist] 78 | std = np.std(coords) 79 | self.assertAlmostEqual(std, 1., 1) 80 | 81 | def test_upsized_transform(self): 82 | square_0 = square[0] * 2 83 | square_0[:4, 2] = 1. 84 | square_0[4, 4] = 1. 
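        # doubling square[0] above also doubled the one-hot flags in columns 2 and 4; they are reset to 1. here so that only the coordinates end up scaled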
85 | scale = gs.scale([square_0]) 86 | n_square = gs.transform([square_0], scale=scale) 87 | self.assertTrue((n_square == normalized_square).all()) 88 | coords = [geom[:, :2].flatten() for geom in n_square] 89 | coords = [item for sublist in coords for item in sublist] 90 | std = np.std(coords) 91 | self.assertAlmostEqual(std, 1., 1) 92 | -------------------------------------------------------------------------------- /model/topoml_util/test_np_gaussian_2d_loss.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from np_gaussian_2d_loss import np_r2_bivariate_gaussian_loss, np_r4_bivariate_gaussian, softmax, epsilon, \ 4 | np_r4_bivariate_gaussian_loss 5 | 6 | 7 | class TestNumpy2DGaussianLoss(unittest.TestCase): 8 | def test_r2_2d_loss(self): 9 | vec_in = np.array([[1, 1, 0, 0, 0]]) 10 | vec_out = vec_in 11 | loss1 = np_r2_bivariate_gaussian_loss(vec_in, vec_out) 12 | vec_out = np.array([[1, 1, 5, 5, 0]]) 13 | loss2 = np_r2_bivariate_gaussian_loss(vec_in, vec_out) 14 | self.assertLess(loss1, loss2) 15 | 16 | vec_out = np.array([[1, 1, 5, -5, 0]]) 17 | loss3 = np_r2_bivariate_gaussian_loss(vec_in, vec_out) 18 | self.assertLess(loss1, loss3) 19 | self.assertEqual(loss2, loss3) 20 | 21 | def test_r4_bivariate_gaussian_loss(self): 22 | vec_in = np.array([[[[1, 1, 0, 0, 0, 0]]]]) 23 | vec_out = vec_in 24 | loss1 = np_r4_bivariate_gaussian_loss(vec_in, vec_out) 25 | vec_out = np.array([[[[1, 1, 5, 5, 0, 0]]]]) 26 | loss2 = np_r4_bivariate_gaussian_loss(vec_in, vec_out) 27 | self.assertLess(loss1, loss2) 28 | 29 | vec_out = np.array([[[[1, 1, 5, -5, 0, 0]]]]) 30 | loss3 = np_r4_bivariate_gaussian_loss(vec_in, vec_out) 31 | self.assertLess(loss1, loss3) 32 | self.assertEqual(loss2, loss3) 33 | 34 | def test_r4_bivariate_gmm_zeros_loss(self): 35 | vec_in = np.array([[[[0, 0, 0, 0, 0, 0]]]]) 36 | vec_in = np.repeat(vec_in, 6, axis=2) # 6 gaussian mixture components 37 | pi_index = 5 38 | pi_weights = softmax(vec_in[:, :, :, pi_index]) 39 | vec_out = vec_in 40 | loss1 = np_r4_bivariate_gaussian(vec_in, vec_out) 41 | loss1 = loss1 * pi_weights 42 | gmm_loss1 = np.sum(-np.log(loss1 + epsilon), keepdims=True) 43 | 44 | vec_out = np.array([[[[1, 1, 5, 5, 0, 0]]]]) 45 | vec_out = np.repeat(vec_out, 6, axis=2) 46 | pi_weights = softmax(vec_out[:, :, :, pi_index]) 47 | loss2 = np_r4_bivariate_gaussian(vec_in, vec_out) * pi_weights 48 | gmm_loss2 = np.sum(-np.log(loss2 + epsilon), keepdims=True) 49 | self.assertLess(gmm_loss1[0, 0, 0], gmm_loss2[0, 0, 0]) 50 | -------------------------------------------------------------------------------- /model/topoml_util/test_rasterization.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from rasterio import features 4 | from shapely import wkt 5 | 6 | 7 | class TestRasterize(unittest.TestCase): 8 | def test_first(self): 9 | size = 20 10 | first = "POLYGON(({0} {0}, {0} -{0}, -{0} -{0}, -{0} {0}, {0} {0}))".format(size) 11 | geo_interfaces = [wkt.loads(first).__geo_interface__] 12 | raster = features.rasterize(geo_interfaces, out_shape=[255, 255]) 13 | self.assertEqual(raster[100, 100], 1) 14 | -------------------------------------------------------------------------------- /model/topoml_util/test_sketch-rnn-model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from keras import backend as K 3 | import tensorflow as tf 4 | import numpy as np 5 | from sketch_rnn_model 
import tf_2d_normal 6 | 7 | PRECISION = 6 8 | sess = tf.InteractiveSession() 9 | 10 | 11 | class TestSketchRnnLoss(unittest.TestCase): 12 | def test_2d_gaussian_zeros(self): 13 | target = np.array([[[0, 0]]], dtype=float) 14 | prediction = np.array([[[0, 0, 0, 0, 0]]], dtype=float) 15 | args = np.append(target, prediction) 16 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 17 | self.assertAlmostEqual(loss, 1.1048509233685306, places=PRECISION) 18 | 19 | def test_2d_gaussian_small_mu_diff(self): 20 | target = np.array([[[5, 52]]], dtype=float) 21 | prediction = np.array([[[5 + 1e-6, 52 + 1e-6, 0, 0, 0]]], dtype=float) 22 | args = np.append(target, prediction) 23 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 24 | self.assertAlmostEqual(loss, 1.1048509233706119, places=PRECISION) 25 | 26 | def test_2d_gaussian_small_sigma_diff(self): 27 | tensor_train = np.array([[[5, 52]]], dtype=float) 28 | tensor_predict = np.array([[[5, 52, 1e-6, 1e-6, 0]]], dtype=float) 29 | loss = tf_2d_normal(tensor_train, tensor_predict).eval() 30 | self.assertAlmostEqual(loss, 1.1048523660629765, places=PRECISION) 31 | 32 | def test_2d_gaussian_mu_ones(self): 33 | target = np.array([[[1, 1]]], dtype=float) 34 | prediction = np.array([[[1, 1, 1, 1, 0]]], dtype=float) 35 | args = np.append(target, prediction) 36 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 37 | self.assertAlmostEqual(loss, 2.3829037437816121, places=PRECISION) 38 | 39 | def test_2d_gaussian_mu_minus_ones(self): 40 | target = np.array([[[1, 1]]], dtype=float) 41 | prediction = np.array([[[1, 1, -1, -1, 0]]], dtype=float) 42 | args = np.append(target, prediction) 43 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 44 | self.assertAlmostEqual(loss, 2.3829037437816121, places=PRECISION) 45 | 46 | def test_2d_gaussian_ones(self): 47 | target = np.array([[[1, 1]]], dtype=float) 48 | prediction = np.array([[[1, 1, 1, 1, 1]]], dtype=float) 49 | args = np.append(target, prediction) 50 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 51 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 52 | 53 | def test_2d_gaussian_rho_one(self): 54 | target = np.array([[[1, 2]]], dtype=float) 55 | prediction = np.array([[[1, 2, 0, 0, 1]]], dtype=float) 56 | args = np.append(target, prediction) 57 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 58 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 59 | 60 | def test_2d_gaussian_rho_minus_one(self): 61 | target = np.array([[[1, 2]]], dtype=float) 62 | prediction = np.array([[[1, 2, 0, 0, -1]]], dtype=float) 63 | args = np.append(target, prediction) 64 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 65 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 66 | 67 | def test_2d_gaussian_rho_two(self): 68 | target = np.array([[[1, 2]]], dtype=float) 69 | prediction = np.array([[[1, 2, 0, 0, 2]]], dtype=float) 70 | args = np.append(target, prediction) 71 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 72 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 73 | 74 | def test_2d_gaussian_rho_minus_two(self): 75 | target = np.array([[[1, 2]]], dtype=float) 76 | prediction = np.array([[[1, 2, 0, 0, -2]]], dtype=float) 77 | args = np.append(target, prediction) 78 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 79 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 80 | 81 | def test_2d_gaussian_big_diff(self): 82 | target = np.array([[[5, 52]]], dtype=float) 83 
| prediction = np.array([[[1, 2, 3, 4, 5]]], dtype=float) 84 | args = np.append(target, prediction) 85 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 86 | self.assertAlmostEqual(loss, 16.11809565095832, places=PRECISION) 87 | 88 | def test_2d_gaussian_really_big_diff(self): 89 | target = np.array([[[5, 52]]], dtype=float) 90 | prediction = np.array([[[0, 0, 3, 4, 5]]], dtype=float) 91 | args = np.append(target, prediction) 92 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 93 | self.assertAlmostEqual(loss, 16.11809565095832, places=PRECISION) 94 | 95 | def test_2d_gaussian_max_neg_rho(self): 96 | min_rho = -19.06 # This is about the limit of rho before geom_gaussian_loss returns NaN 97 | target = np.array([[[5, 52]]], dtype=float) 98 | prediction = np.array([[[5, 52, -1, -1, min_rho]]], dtype=float) 99 | args = np.append(target, prediction) 100 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 101 | self.assertAlmostEqual(loss, -18.505382378927028, places=PRECISION) -------------------------------------------------------------------------------- /model/topoml_util/test_wkt2pyplot.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from datetime import datetime 4 | 5 | from shapely.geometry import Point 6 | from topoml_util.wkt2pyplot import wkt2pyplot 7 | 8 | from topoml_util.GeoVectorizer import GeoVectorizer 9 | 10 | 11 | class TestWktToPyplotPoly(unittest.TestCase): 12 | def test_geometrycollection_empty(self): 13 | inputs = ['GEOMETRYCOLLECTION EMPTY'] # This is valid WKT 14 | plt, fig, ax = wkt2pyplot(inputs) 15 | plt.show() # It should show an empty plot 16 | 17 | def test_polygon_conversion(self): 18 | TIMESTAMP = str(datetime.now()).replace(':', '.') 19 | 20 | inputs = 'POLYGON((1.09872727273 -0.289454545452,-0.241272727273 0.682545454538,-0.992272727274 ' \ 21 | '0.292545454528,0.347727272727 -0.680454545474,1.09872727273 -0.289454545452))\nPOLYGON((' \ 22 | '-0.976272727273 0.302545454574,-0.25627272727 0.676545454539,1.05372727273 -0.276454545443,' \ 23 | '0.320727272731 -0.654454545455,-0.477272727268 -0.0664545454754,-0.976272727273 ' \ 24 | '0.302545454574))' 25 | inputs = inputs.split('\n') 26 | 27 | target = 'POLYGON((-0.974272727277 0.301545454562,-0.255272727276 0.675545454527,1.05372727273 ' \ 28 | '-0.276454545443,0.320727272731 -0.654454545455,-0.477272727268 -0.0664545454754,-0.974272727277 ' \ 29 | '0.301545454562))' 30 | 31 | prediction = [ 32 | 'POINT(-0.974272727277 0.301545454562)', 33 | 'POINT(-0.255272727276 0.675545454527)', 34 | 'POINT(1.05372727273 -0.276454545443)', 35 | 'POINT(0.320727272731 -0.654454545455)', 36 | 'POINT(-0.477272727268 -0.0664545454754)', 37 | 'POINT(-0.974272727277 0.301545454562)', 38 | ] 39 | plt, fig, ax = wkt2pyplot(inputs, [target], prediction) 40 | plt.text(0.01, 0.06, 'prediction: some more text', transform=ax.transAxes) 41 | plt.text(0.01, 0.01, 'target: some text', transform=ax.transAxes) 42 | 43 | plt.show() 44 | 45 | def test_gaussian_sample_plot(self): 46 | 47 | inputs = 'POLYGON((1.09872727273 -0.289454545452,-0.241272727273 0.682545454538,-0.992272727274 ' \ 48 | '0.292545454528,0.347727272727 -0.680454545474,1.09872727273 -0.289454545452))\nPOLYGON((' \ 49 | '-0.976272727273 0.302545454574,-0.25627272727 0.676545454539,1.05372727273 -0.276454545443,' \ 50 | '0.320727272731 -0.654454545455,-0.477272727268 -0.0664545454754,-0.976272727273 ' \ 51 | '0.302545454574))' 52 | inputs = inputs.split('\n') 53 | 54 | target = 
np.array([ 55 | # mu1 mu2 s1 s2 rho pi [geo type one-hot ] [render 1hot] 56 | [0.1, 0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., ], 57 | [0.1, -0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., ], 58 | [-0.1, -0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., ], 59 | [-0.1, 0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., ], 60 | [0.1, 0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., ], 61 | [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., ], 62 | [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., ] 63 | ]) 64 | 65 | target = [Point(point).wkt for point in 66 | GeoVectorizer(gmm_size=1).decypher_gmm_geom(target, 1000)] 67 | 68 | plt, fig, ax = wkt2pyplot(inputs, target, None) 69 | plt.show() 70 | -------------------------------------------------------------------------------- /model/topoml_util/wkt2pyplot.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import matplotlib 4 | import os 5 | 6 | if not os.environ.get('MATPLOTLIB_TEST'): 7 | matplotlib.use('Agg') # for headless machine instances 8 | 9 | from shapely import wkt 10 | from matplotlib import pyplot as plt 11 | 12 | 13 | def wkt2pyplot(input_wkts, target_wkts=None, prediction_wkts=None, 14 | input_color='green', target_color='red', pred_color='blue'): 15 | """ 16 | Convert arrays of input, target and prediction well-known text (WKT) geometries to a pyplot figure 17 | :param input_wkts: an array of input geometries, rendered in (standard) green 18 | :param target_wkts: optional array of target geometries, rendered in (standard) red 19 | :param prediction_wkts: optional array of prediction geometries, rendered in (standard) blue 20 | :param input_color: a pyplot-compatible notation of color, default green 21 | :param target_color: a pyplot-compatible notation of color, default red 22 | :param pred_color: a pyplot-compatible notation of color, default blue 23 | :return: the matplotlib plt, fig and ax 24 | """ 25 | input_geoms = [wkt.loads(input_wkt) for input_wkt in input_wkts] 26 | 27 | fig, ax = plt.subplots() 28 | 29 | input_polys = [] 30 | for input_geom in input_geoms: 31 | if len(input_geom.bounds) > 0 and input_geom.geom_type == 'Polygon': 32 | input_polys.append(matplotlib.patches.Polygon(input_geom.boundary.coords)) 33 | 34 | inputs = matplotlib.collections.PatchCollection(input_polys, alpha=0.4, linewidth=1) 35 | inputs.set_color(input_color) 36 | ax.add_collection(inputs) 37 | 38 | # target_polys = [Polygon(target_geom.boundary.coords) for target_geom in target_geoms] 39 | # targets = PatchCollection(target_polys, alpha=0.4, linewidth=1) 40 | # targets.set_color(target_color) 41 | # ax.add_collection(targets) 42 | 43 | # TODO: handle other types of geometries 44 | # TODO: handle holes in polygons (donuts) 45 | if target_wkts: 46 | target_geoms = [wkt.loads(target_wkt) for target_wkt in target_wkts] 47 | for geom in target_geoms: 48 | if geom.type == 'Point': 49 | plt.plot(geom.coords.xy[0][0], geom.coords.xy[1][0], 50 | marker='o', color=target_color, alpha=0.4, linewidth=0) 51 | elif geom.type == 'Polygon': 52 | collection = matplotlib.collections.PatchCollection([matplotlib.patches.Polygon(geom.boundary.coords)], 53 | alpha=0.4, linewidth=1) 54 | collection.set_color(target_color) 55 | ax.add_collection(collection) 56 | 57 | if prediction_wkts: 58 | prediction_geoms = [wkt.loads(prediction_wkt) for prediction_wkt in
prediction_wkts] 59 | for geom in prediction_geoms: 60 | if geom.geom_type == 'Point': 61 | plt.plot(geom.coords.xy[0][0], geom.coords.xy[1][0], 62 | marker='o', color=pred_color, alpha=0.1, linewidth=0) 63 | elif geom.type == 'Polygon': 64 | collection = matplotlib.collections.PatchCollection([matplotlib.patches.Polygon(geom.boundary.coords)], 65 | alpha=0.4, linewidth=1) 66 | collection.set_color(pred_color) 67 | ax.add_collection(collection) 68 | 69 | plt.axis('auto') 70 | 71 | return plt, fig, ax 72 | 73 | 74 | def save_plot(geoms, plot_dir='plots', timestamp=None): 75 | os.makedirs(str(plot_dir), exist_ok=True) 76 | plt, fig, ax = wkt2pyplot(*geoms) 77 | plt.savefig(plot_dir + '/plt_' + timestamp + '.png') 78 | plt.close('all') 79 | -------------------------------------------------------------------------------- /prep/ProgressBar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from http://stackoverflow.com/questions/3160699/python-progress-bar 3 | """ 4 | 5 | import sys 6 | 7 | from time import time 8 | 9 | 10 | class ProgressBar: 11 | """ 12 | Class for creating std output progress indication bars 13 | """ 14 | def __init__(self, bar_length=40): 15 | """ 16 | Constructor 17 | :param bar_length: length of the bar in characters 18 | """ 19 | self.start_seconds = time() 20 | self.bar_length = bar_length 21 | 22 | def update_progress(self, progress, status=''): 23 | """ 24 | update_progress() : Displays or updates a std out progress bar 25 | 26 | The method simply repeats on the console each time the method is called 27 | :param status: Optional status message 28 | :param progress: Accepts a float between 0 and 1. Any int will be converted to a float. 29 | A value under 0 represents a 'halt'. 
30 | A value at 1 or bigger represents 100% 31 | :return: None 32 | """ 33 | 34 | if isinstance(progress, int): 35 | progress = float(progress) 36 | if not isinstance(progress, float): 37 | raise ValueError("error: progress must be numeric") 38 | if progress < 0: 39 | progress = 0 40 | status = "Halt...\r\n" 41 | if progress >= 1: 42 | progress = 1 43 | status = "Done...\r\n" 44 | 45 | progress_rounded = "{:10.2f}".format(float(progress*100)) 46 | elapsed_time = time() - self.start_seconds 47 | if progress > 0: 48 | projected_time = elapsed_time / progress - elapsed_time 49 | else: 50 | projected_time = 0 51 | 52 | block = round(self.bar_length * min(progress, 1)) 53 | progress_line = "\U000025B0" * (max(0, block - 1)) + "\U000025BA" 54 | progress_line += "\U000025B1" * (self.bar_length - block) 55 | 56 | hours, remainder = divmod(projected_time, 3600) 57 | minutes, seconds = divmod(remainder, 60) 58 | eta = '{}h{}m{}s'.format(int(hours), int(minutes), int(seconds)) 59 | 60 | text = "\r\U0001F680{}\U0001F3C1 {}% {} {}".format(progress_line, progress_rounded, eta, status) 61 | sys.stdout.write(text) 62 | sys.stdout.flush() 63 | -------------------------------------------------------------------------------- /prep/densified.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from topoml_util.GeoVectorizer import GeoVectorizer 4 | from topoml_util.geom_scaler import localized_mean, localized_normal 5 | 6 | DATA_FILE = '../files/brt_osm/brt_osm.npz' 7 | TARGET_FILE = '../files/brt_osm/densified_vectorized.npz' 8 | DENSIFIED = 100 9 | 10 | loaded = np.load(DATA_FILE) 11 | raw_training_vectors = loaded['input_geoms'] 12 | raw_target_vectors = loaded['intersection'] 13 | 14 | training_vectors = [] 15 | target_vectors = [] 16 | 17 | # skip non-intersecting geometries 18 | for train, target in zip(raw_training_vectors, raw_target_vectors): 19 | if not target[0, 0] == 0: # a zero coordinate designates an empty geometry 20 | training_vectors.append(train) 21 | target_vectors.append(target) 22 | 23 | print('Preprocessing vectors...') 24 | means = localized_mean(training_vectors) 25 | training_vectors = localized_normal(training_vectors, means, 1e4) 26 | training_vectors = np.array([GeoVectorizer.interpolate(vector, DENSIFIED) for vector in training_vectors]) 27 | target_vectors = localized_normal(target_vectors, means, 1e4) 28 | target_vectors = np.array([GeoVectorizer.interpolate(vector, 50) for vector in target_vectors]) 29 | 30 | print('Saving compressed numpy data file', TARGET_FILE) 31 | 32 | np.savez_compressed( 33 | TARGET_FILE, 34 | input_geoms=training_vectors, 35 | intersection=target_vectors 36 | ) 37 | 38 | print('Done!') 39 | -------------------------------------------------------------------------------- /prep/export-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "exporting data, this will take a couple of minutes..." 3 | rm /data/files/brt_osm/brt_osm.csv 4 | # extract the joined data 5 | # https://gis.stackexchange.com/questions/185072/ogr2ogr-sql-query-from-text-file#185141 6 | cd /data/files 7 | set -ex 8 | ogr2ogr -f CSV brt_osm/brt_osm.csv PG:"host=postgis port=5432 dbname=postgres user=postgres password=postgres" -sql @../prep/spatial-join.sql 9 | set -e 10 | lines=$(tail -n +2 brt_osm.csv | wc -l) 11 | echo 12 | echo "Wrote $lines number of data points" 13 | echo "The export script ran successfully. 
The generated data set was saved to files/brt_osm/brt_osm.csv" 14 | 15 | cd ../prep 16 | echo "Creating BRT/OSM numpy archive..." 17 | python3 vectorize_brt_osm.py 18 | echo "Creating neighborhoods numpy archive..." 19 | python3 preprocess-neighborhoods.py 20 | 21 | echo "Done!" -------------------------------------------------------------------------------- /prep/get-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "importing data... this will take a while depending on your internet connection speed" 3 | set -ex 4 | mkdir -p /data/files/base_registration_topography /data/files/openstreetmap /data/files/neighborhoods /data/files/buildings 5 | cd /data/files 6 | 7 | # Get the Base Registration for Topography data 8 | curl -o base_registration_topography/TOP10NL_25W.zip https://geodata.nationaalgeoregister.nl/top10nlv2/extract/kaartbladen/TOP10NL_25W.zip?formaat=gml 9 | curl -o base_registration_topography/TOP10NL_34O.zip https://geodata.nationaalgeoregister.nl/top10nlv2/extract/kaartbladen/TOP10NL_34O.zip?formaat=gml 10 | 11 | # Get the OpenStreetMap data 12 | curl -o openstreetmap/netherlands-latest-free.shp.zip http://download.geofabrik.de/europe/netherlands-latest-free.shp.zip 13 | 14 | # Get neighborhoods 15 | curl -X GET \ 16 | -o neighborhoods/neighborhoods.csv \ 17 | 'https://geodata.nationaalgeoregister.nl/wijkenbuurten2017/wfs?request=GetFeature&service=WFS&version=2.0.0&typeName=cbs_buurten_2017&outputFormat=csv&srsName=EPSG%3A4326&PropertyName=aantal_inwoners%2Cgeom' 18 | 19 | # Get BAG buildings 20 | types=( woonfunctie winkelfunctie bijeenkomstfunctie onderwijsfunctie gezondheidszorgfunctie kantoorfunctie industriefunctie sportfunctie logiesfunctie ) 21 | pages=( 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 ) 22 | 23 | for type in "${types[@]}" 24 | do 25 | url="https://geodata.nationaalgeoregister.nl/bag/wfs?request=GetFeature&service=WFS&version=2.0.0&typeName=pand&outputFormat=csv&srsName=EPSG%3A4326&PropertyName=geometrie%2Cgebruiksdoel&cql_filter=(gebruiksdoel%3D'${type}')" 26 | curl -X GET ${url} | grep -e gebruiksdoel -e pand > buildings/buildings-${type}.csv 27 | for page in "${pages[@]}" 28 | do 29 | url="https://geodata.nationaalgeoregister.nl/bag/wfs?request=GetFeature&service=WFS&version=2.0.0&typeName=pand&outputFormat=csv&srsName=EPSG%3A4326&PropertyName=geometrie%2Cgebruiksdoel&startIndex="${page}"&cql_filter=(gebruiksdoel%3D'${type}')" 30 | echo ${url} 31 | curl -X GET ${url} | grep -v gebruiksdoel | grep pand >> buildings/buildings-${type}.csv 32 | done 33 | done 34 | 35 | # Inflate 36 | unzip -o base_registration_topography/TOP10NL_25W.zip 37 | unzip -o base_registration_topography/TOP10NL_34O.zip 38 | unzip -o openstreetmap/netherlands-latest-free.shp.zip *buildings* 39 | 40 | # Load the database.
Be sure to have the postgis container running 41 | ogr2ogr -f "PostgreSQL" PG:"host=postgis port=5432 dbname=postgres user=postgres password=postgres" base_registration_topography/TOP10NL_25W.gml -overwrite -progress -t_srs "EPSG:4326" -oo GML_ATTRIBUTES_TO_OGR_FIELDS=YES 42 | ogr2ogr -f "PostgreSQL" PG:"host=postgis port=5432 dbname=postgres user=postgres password=postgres" base_registration_topography/TOP10NL_34O.gml -append -progress -t_srs "EPSG:4326" -oo GML_ATTRIBUTES_TO_OGR_FIELDS=YES 43 | # https://trac.osgeo.org/gdal/ticket/4939 44 | # http://www.bostongis.com/PrinterFriendly.aspx?content_name=ogr_cheatsheet 45 | ogr2ogr -f "PostgreSQL" PG:"host=postgis port=5432 dbname=postgres user=postgres password=postgres" openstreetmap/gis.osm_buildings_a_free_1.shp -overwrite -progress -nln osm_buildings -nlt PROMOTE_TO_MULTI -lco EXTRACT_SCHEMA_FROM_LAYER_NAME=no 46 | 47 | bash ./export-data.sh -------------------------------------------------------------------------------- /prep/preprocess-neighborhoods.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocessing script to convert well-known-text geometries to matrix representations thereof. 3 | With a SANE_NUMBER_OF_POINTS set to 2048, it simplifies only 248 4 | """ 5 | 6 | import os 7 | from datetime import timedelta 8 | from time import time 9 | from zipfile import ZipFile 10 | 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | from pandas import read_csv 14 | from shapely import wkt 15 | from sklearn.model_selection import train_test_split 16 | 17 | from model.topoml_util.GeoVectorizer import GeoVectorizer 18 | from model.topoml_util.geom_fourier_descriptors import create_geom_fourier_descriptor 19 | from prep.ProgressBar import ProgressBar 20 | 21 | SCRIPT_VERSION = '7' 22 | SOURCE_DIR = '../files/neighborhoods/' 23 | SOURCE_ZIP = SOURCE_DIR + 'neighborhoods.csv.zip' 24 | SOURCE_CSV = 'neighborhoods.csv' 25 | LOG_FILE = 'neighborhoods_preprocessing.log' 26 | TRAIN_DATA_FILE = SOURCE_DIR + 'neighborhoods_train_v' + SCRIPT_VERSION 27 | TEST_DATA_FILE = SOURCE_DIR + 'neighborhoods_test_v' + SCRIPT_VERSION 28 | SANE_NUMBER_OF_POINTS = 2048 29 | REDUCED_POINTS = 256 30 | TRAIN_TEST_SPLIT = 0.1 31 | FOURIER_DESCRIPTOR_ORDER = 32 # The axis 0 size 32 | SCRIPT_START = time() 33 | 34 | if not os.path.isfile(SOURCE_ZIP): 35 | raise FileNotFoundError('Unable to locate {}. 
Please run the prep/get-data.sh script first'.format(SOURCE_ZIP)) 36 | 37 | print('Preprocessing neighborhoods...') 38 | zip_file = ZipFile(SOURCE_ZIP) 39 | df = read_csv(zip_file.open(SOURCE_CSV)) 40 | df = df[df.aantal_inwoners >= 0] # Filter out negative placeholder values for unknowns 41 | 42 | print('Creating geometry vectors and descriptors...') 43 | wkt_vectors = [] 44 | shapes = [wkt.loads(wkt_string) for wkt_string in df.geom.values] 45 | number_of_vertices = [GeoVectorizer.num_points_from_wkt(shape.wkt) for shape in shapes] 46 | 47 | plt.hist(number_of_vertices, bins=20, log=True) 48 | plt.savefig('neighborhood_geom_vertices_distr.png') 49 | geoms_above_threshold = len([v for v in number_of_vertices if v > SANE_NUMBER_OF_POINTS]) 50 | print('{} of the {} geometries are over the max {} vertices threshold and will be simplified.\n'.format( 51 | geoms_above_threshold, len(shapes), SANE_NUMBER_OF_POINTS)) 52 | 53 | pgb = ProgressBar() 54 | logfile = open(LOG_FILE, 'w') 55 | selected_data = [] 56 | simplified_geometries = 0 57 | errors = 0 58 | 59 | for index, (inhabitants, wkt_string) in enumerate(zip(df.aantal_inwoners.values, df.geom.values)): 60 | pgb.update_progress(index/len(df.geom.values), '{} geometries, {} errors in logfile'.format(index, errors)) 61 | try: 62 | shape = wkt.loads(wkt_string) 63 | fixed_size_wkt_vector = GeoVectorizer.vectorize_wkt(wkt_string, REDUCED_POINTS, simplify=True, fixed_size=True) 64 | 65 | geom_len = min(GeoVectorizer.num_points_from_wkt(shape.wkt), SANE_NUMBER_OF_POINTS) 66 | if geom_len == SANE_NUMBER_OF_POINTS: 67 | simplified_geometries += 1 68 | wkt_vector = GeoVectorizer.vectorize_wkt(wkt_string, geom_len, simplify=True) 69 | 70 | # If multipart multipolygon: select the largest, but it will throw off the accuracy a bit.
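        # Only the largest part is used for the Fourier descriptors below; the vectorized geometries above still cover the full multipart shape.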
71 | if shape.geom_type == 'MultiPolygon': 72 | if len(shape.geoms) > 1: 73 | geometries = sorted(shape.geoms, key=lambda x: x.area) 74 | shape = geometries[-1] 75 | else: 76 | shape = shape.geoms[0] 77 | elif shape.geom_type == 'Polygon': 78 | pass 79 | else: 80 | logfile.write('skipping record: no (multi)polygon entry in {} on line {}\n'.format( 81 | SOURCE_CSV, index + 2)) 82 | errors += 1 83 | continue 84 | 85 | efds = create_geom_fourier_descriptor(shape, FOURIER_DESCRIPTOR_ORDER) 86 | 87 | except Exception as e: 88 | logfile.write('Skipping record on account of geometry entry in {} on line {} with error: {}\n'.format( 89 | SOURCE_CSV, index + 2, e)) 90 | errors += 1 91 | continue 92 | 93 | # Append the converted values if all went well 94 | selected_data.append({ 95 | 'geom': wkt_vector, 96 | 'fixed_size_geom': fixed_size_wkt_vector, 97 | 'elliptic_fourier_descriptors': efds, 98 | 'inhabitants': inhabitants, 99 | }) 100 | 101 | logfile.close() 102 | print('\ncreated {} data points with {} simplified geometries and {} errors'.format( 103 | len(selected_data), simplified_geometries, errors)) 104 | 105 | median = np.median([p['inhabitants'] for p in selected_data]) 106 | print('Median:', median, 'inhabitants') 107 | 108 | # Split and save data 109 | train, test = train_test_split(selected_data, test_size=TRAIN_TEST_SPLIT, random_state=42) 110 | 111 | print('Saving test data...') 112 | # Test data is small enough to put in one archive 113 | np.savez_compressed( 114 | TEST_DATA_FILE, 115 | geoms=[record['geom'] for record in test], 116 | fixed_size_geoms=[record['fixed_size_geom'] for record in test], 117 | elliptic_fourier_descriptors=[record['elliptic_fourier_descriptors'] for record in test], 118 | inhabitants=[record['inhabitants'] for record in test], 119 | above_or_below_median=[int(record['inhabitants'] > median) for record in test], 120 | type_index={0: 'less than median', 1: 'greater than or equal to median'}, 121 | ) 122 | 123 | print('Saving training data...') 124 | np.savez_compressed( 125 | TRAIN_DATA_FILE, 126 | geoms=[record['geom'] for record in train], 127 | fixed_size_geoms=[record['fixed_size_geom'] for record in train], 128 | elliptic_fourier_descriptors=[record['elliptic_fourier_descriptors'] for record in train], 129 | inhabitants=[record['inhabitants'] for record in train], 130 | above_or_below_median=[int(record['inhabitants'] > median) for record in train], 131 | type_index={0: 'less than median', 1: 'greater than or equal to median'}, 132 | ) 133 | 134 | runtime = time() - SCRIPT_START 135 | print('Done in {}'.format(timedelta(seconds=runtime))) 136 | -------------------------------------------------------------------------------- /prep/spatial-join.sql: -------------------------------------------------------------------------------- 1 | SELECT st_astext(st_snaptogrid(gebouw.wkb_geometry, 0.0000001)) AS brt_wkt, 2 | st_astext(st_snaptogrid(ST_GeometryN(osm_buildings.wkb_geometry, 1), 0.0000001)) AS osm_wkt, 3 | st_astext(st_snaptogrid(st_intersection(gebouw.wkb_geometry, osm_buildings.wkb_geometry), 0.0000001)) AS intersection_wkt, 4 | st_distance(st_transform(st_centroid(gebouw.wkb_geometry), 28992), st_transform(st_centroid(osm_buildings.wkb_geometry), 28992))::real AS centroid_distance, 5 | st_distance(st_transform(gebouw.wkb_geometry, 28992), st_transform(osm_buildings.wkb_geometry, 28992))::real AS geom_distance, 6 | st_astext(st_snaptogrid(st_centroid(gebouw.wkb_geometry), 0.0000001)) AS brt_centroid_wkt, 7 | st_astext(st_snaptogrid(st_centroid(osm_buildings.wkb_geometry),
0.0000001)) AS osm_centroid_wkt, 8 | st_astext(st_snaptogrid(st_transform(st_centroid(gebouw.wkb_geometry), 28992), 0.0000001)) AS brt_centroid_wkt_rd, 9 | st_astext(st_snaptogrid(st_transform(st_centroid(osm_buildings.wkb_geometry), 28992), 0.0000001)) AS osm_centroid_wkt_rd, 10 | st_area(st_transform(st_intersection(gebouw.wkb_geometry, osm_buildings.wkb_geometry), 28992))::real AS intersection_surface 11 | FROM gebouw, osm_buildings 12 | WHERE 13 | -- Allow only polygons (there are a few point buildings in there, don't ask me why) 14 | ST_GeometryType(gebouw.wkb_geometry) = 'ST_Polygon' AND 15 | -- Expand each source geometry with a buffer of a few meters to include non-intersecting target geometries 16 | st_intersects(st_buffer(gebouw.wkb_geometry, 0.00005), osm_buildings.wkb_geometry) AND 17 | -- Guarantee good geometries 18 | st_issimple(st_snaptogrid(gebouw.wkb_geometry, 0.000001)) AND 19 | st_issimple(st_snaptogrid(osm_buildings.wkb_geometry, 0.000001)) AND 20 | st_issimple(st_snaptogrid(st_intersection(gebouw.wkb_geometry, osm_buildings.wkb_geometry), 0.000001)) AND 21 | -- Restrict to ringless polygons 22 | ST_NumInteriorRings(gebouw.wkb_geometry) = 0 AND 23 | ST_NumInteriorRings(ST_GeometryN(osm_buildings.wkb_geometry, 1)) = 0 24 | LIMIT 500000 -------------------------------------------------------------------------------- /prep/triangles.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from model.topoml_util.GeoVectorizer import GeoVectorizer 3 | from shapely.geometry import Polygon 4 | from shapely.wkt import loads 5 | 6 | SET_SIZE = 100000 7 | TRIANGLES = '../files/triangles.npz' 8 | 9 | 10 | print('Creating triangles') 11 | raw_training_vectors = np.random.normal(size=(SET_SIZE, 6, 2)) 12 | triangle_sets = np.array([[Polygon(point_set[0:3]).wkt, Polygon(point_set[3:]).wkt] 13 | for point_set in raw_training_vectors]) 14 | max_points = GeoVectorizer.max_points(triangle_sets[:, 0], triangle_sets[:, 1]) 15 | 16 | print('Intersecting triangles and pruning') 17 | intersection_area = [] 18 | intersection_vectors = [] 19 | for index, (a, b) in enumerate(triangle_sets): 20 | # if loads(a).intersection_surface_area(loads(b)).type == 'Polygon': # constrain to actually intersecting 21 | intersection = loads(a).intersection(loads(b)) 22 | intersection_area.append(intersection.area) 23 | intersection_vectors.append(GeoVectorizer.vectorize_wkt(intersection.wkt, 12)) 24 | 25 | training_vectors = np.reshape(raw_training_vectors, (SET_SIZE, 12)) 26 | (_, GEO_VECTOR_LEN) = np.array(training_vectors).shape 27 | intersection_area = np.array(intersection_area) 28 | 29 | print('Saving compressed numpy data file', TRIANGLES) 30 | 31 | np.savez_compressed( 32 | TRIANGLES, 33 | point_sequence=training_vectors, # Sets of two geometries in WGS84 lon/lat, 25% of them overlapping 34 | intersection_geoms=intersection_vectors, # Geometries representing the intersection_surface_area in WGS84 lon/lat 35 | intersection_surface=intersection_area, # Surface in square meters of the intersection_surface_area 36 | ) 37 | print('Saved vectorized geometries to', TRIANGLES) 38 | -------------------------------------------------------------------------------- /prep/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/prep/util/__init__.py 
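An illustrative sketch (not part of the repository) of how the archive written by prep/triangles.py above could be inspected; the key names follow the np.savez_compressed call in that script, the relative path assumes the interpreter runs from the prep/ directory, and newer numpy versions may additionally need allow_pickle=True:

import numpy as np

data = np.load('../files/triangles.npz')
print(data['point_sequence'].shape)      # (100000, 12): two random triangles of three 2D points per record
print(len(data['intersection_geoms']))   # one vectorized intersection geometry per record
print(data['intersection_surface'][:5])  # intersection areas of the first few triangle pairs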
-------------------------------------------------------------------------------- /prep/util/layerToWGS.py: -------------------------------------------------------------------------------- 1 | from osgeo import osr, gdal 2 | 3 | 4 | def gdal_error_handler(err_class, err_num, err_msg): 5 | err_type = { 6 | gdal.CE_None: 'None', 7 | gdal.CE_Debug: 'Debug', 8 | gdal.CE_Warning: 'Warning', 9 | gdal.CE_Failure: 'Failure', 10 | gdal.CE_Fatal: 'Fatal' 11 | } 12 | err_msg = err_msg.replace('\n', ' ') 13 | err_class = err_type.get(err_class, 'None') 14 | print('Error Number: %s' % err_num) 15 | print('Error Type: %s' % err_class) 16 | print('Error Message: %s' % err_msg) 17 | 18 | 19 | # install error handler 20 | gdal.PushErrorHandler(gdal_error_handler) 21 | 22 | 23 | def layerToWGS(in_layer): 24 | out_driver = gdal.ogr.GetDriverByName('MEMORY') 25 | out_dataset = out_driver.CreateDataSource('Output datasource') 26 | out_layer = out_dataset.CreateLayer('Gebouw', geom_type=in_layer.GetGeomType()) 27 | 28 | # input SpatialReference 29 | in_spatial_ref = osr.SpatialReference() 30 | in_spatial_ref.ImportFromEPSG(28992) 31 | 32 | # output SpatialReference 33 | out_spatial_ref = osr.SpatialReference() 34 | out_spatial_ref.ImportFromEPSG(4326) 35 | 36 | # create the CoordinateTransformation 37 | coord_trans = osr.CoordinateTransformation(in_spatial_ref, out_spatial_ref) 38 | 39 | in_layer_defn = in_layer.GetLayerDefn() 40 | # get the output layer's feature definition 41 | out_layer_defn = out_layer.GetLayerDefn() 42 | 43 | for i in range(0, in_layer_defn.GetFieldCount()): 44 | field_defn = in_layer_defn.GetFieldDefn(i) 45 | out_layer.CreateField(field_defn) 46 | 47 | # loop through the input features 48 | in_feature = in_layer.GetNextFeature() 49 | while in_feature: 50 | # get the input geometry 51 | geometry = in_feature.GetGeometryRef() 52 | # reproject the geometry 53 | geometry.Transform(coord_trans) 54 | # create a new feature 55 | out_feature = in_feature.Clone() 56 | # set the geometry and attribute 57 | out_feature.SetGeometry(geometry) 58 | # out_feature.SetFieldsFrom(in_feature) 59 | # for i in range(0, out_layer_defn.GetFieldCount()): 60 | # out_feature.SetField(out_layer_defn.GetFieldDefn(i).GetNameRef(), in_feature.GetField(i)) 61 | # add the feature to the layer 62 | out_layer.CreateFeature(out_feature) 63 | # dereference the features and get the next input feature 64 | out_feature = None 65 | in_feature = in_layer.GetNextFeature() 66 | 67 | return out_layer 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | Keras>=2.1.2 3 | numpy>=1.14.0 4 | pandas>=0.22.0 5 | scikit-learn>=0.19.1 6 | scipy>=1.0.0 7 | Shapely>=1.6.3 8 | slackclient>=1.1.0 9 | tensorflow-gpu>=1.4.1 10 | matplotlib>=2.1.2 11 | pyefd>=1.0 12 | -------------------------------------------------------------------------------- /script/build-script.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | echo "Changes:" 4 | cat $1 5 | 6 | # TeamCity style 7 | CHANGED_MODEL_FILES="$(cat $1 | \ 8 | grep -v REMOVED | \ 9 | cut -d \: -f 1 | \ 10 | grep -e model | \ 11 | grep .py | \ 12 | grep -v util | \ 13 | grep -v baseline | \ 14 | grep -v png \ 15 | )" 16 | echo ${CHANGED_MODEL_FILES} 17 | 18 | # Comment out line below to enable automated script execution 19 | #CHANGED_MODEL_FILES="echo ${CHANGED_MODEL_FILES} | grep 
DISABLE_AUTOMATED_EXECUTION" 20 | 21 | set -e 22 | cd model 23 | for FILE in ${CHANGED_MODEL_FILES} 24 | do 25 | python3 ../${FILE} 26 | done 27 | 28 | echo "built!" -------------------------------------------------------------------------------- /script/install-docker-ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | sudo apt-get update 3 | sudo apt-get install -y \ 4 | apt-transport-https \ 5 | ca-certificates \ 6 | curl \ 7 | software-properties-common 8 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 9 | sudo add-apt-repository \ 10 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ 11 | $(lsb_release -cs) \ 12 | stable" 13 | sudo apt-get update 14 | sudo apt-get -y install docker-ce docker-compose 15 | sudo gpasswd -a ${USER} docker 16 | newgrp docker 17 | 18 | # nvidia-docker 19 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \ 20 | sudo apt-key add - 21 | distribution=$(. /etc/os-release;echo ${ID}${VERSION_ID}) 22 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \ 23 | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 24 | sudo apt-get update 25 | sudo apt-get install -y nvidia-docker2 26 | sudo tee /etc/docker/daemon.json < /etc/apt/sources.list.d/owncloud-client.list" 18 | sudo apt-get update 19 | sudo apt-get install owncloud-client 20 | 21 | # Geospatial dependencies 22 | sudo add-apt-repository ppa:ubuntugis/ppa 23 | sudo apt-get update 24 | sudo apt-get install -y python-numpy gdal-bin libgdal-dev 25 | pip3 install shapely rasterio 26 | sudo apt-get install -y libgeos-dev python3-tk # reinstall python3? 27 | 28 | # Machine learning dependencies 29 | sudo pip3 install --upgrade keras # check ~/.keras/keras.json for correct settings! 30 | # Install magenta requirement cuda 8.0 v6 for tf 1.2 - 1.4 31 | # From https://gitlab.com/nvidia/cuda/blob/c5e8c8d7a9fd444c4e45573f36cbeb8f4e10f71c/8.0/runtime/cudnn6/Dockerfile 32 | # And https://stackoverflow.com/questions/41991101/importerror-libcudnn-when-running-a-tensorflow-program 33 | 34 | # Updated drivers 35 | sudo add-apt-repository ppa:graphics-drivers/ppa 36 | sudo apt-get update 37 | 38 | #Install the recommended driver (currently nvidia-390) 39 | sudo ubuntu-drivers autoinstall 40 | 41 | # cuda toolkit, see also https://developer.nvidia.com/cuda-toolkit-archive 42 | wget -O cuda_8_linux.run https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run 43 | sudo chmod +x cuda_8_linux.run 44 | ech./cuda_8_linux.run 45 | #Do you accept the previously read EULA? 46 | #accept 47 | #Install NVIDIA Accelerated Graphics Driver for Linux-x86_64 367.48? 48 | #n (we installed drivers previously) 49 | #Install the CUDA 8.0 Toolkit? 50 | #y 51 | #Enter Toolkit Location: 52 | #/usr/local/cuda-8.0 (enter) 53 | #Do you wish to run the installation with ‚sudo’? 54 | #y 55 | #Do you want to install a symbolic link at /usr/local/cuda? 56 | #y 57 | #Install the CUDA 8.0 Samples? 
58 | #y 59 | #Enter CUDA Samples Location: 60 | #enter 61 | 62 | sudo apt-get install -y libcupti-dev 63 | 64 | # Install cudnn 65 | cd ~ 66 | wget http://developer.download.nvidia.com/compute/redist/cudnn/v6.0/cudnn-8.0-linux-x64-v6.0.tgz 67 | tar xvzf cudnn-8.0-linux-x64-v6.0.tgz 68 | sudo cp -P cuda/include/cudnn.h /usr/local/cuda-8.0/include 69 | sudo cp -P cuda/lib64/libcudnn* /usr/local/cuda-8.0/lib64/ 70 | sudo chmod a+r /usr/local/cuda-8.0/include/cudnn.h /usr/local/cuda-8.0/lib64/libcudnn* 71 | 72 | # set environment variables 73 | echo export PATH=/usr/local/cuda-8.0/bin${PATH:+:${PATH}} >> ~/.bashrc 74 | echo export LD_LIBRARY_PATH=/usr/local/cuda-8.0/lib64/:/usr/lib/nvidia-384${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} >> ~/.bashrc 75 | echo export CUDA_HOME=/usr/local/cuda-8.0 >> ~/.bashrc 76 | source ~/.bashrc 77 | 78 | # GUI and remote access 79 | sudo apt-get install -y lxde 80 | # sudo rm /home/ubuntu/.Xauthority 81 | sudo startlxde 82 | sudo add-apt-repository -y ppa:x2go/stable 83 | sudo apt-get update 84 | sudo apt-get install -y x2goserver x2goserver-xsession 85 | wget https://download.jetbrains.com/python/pycharm-community-2017.2.3.tar.gz 86 | tar xvzf pycharm-community-2017.2.3.tar.gz 87 | 88 | # time zone and numlock config 89 | sudo timedatectl set-timezone Europe/Amsterdam 90 | sudo apt-get install numlockx 91 | sudo sed -i 's|^exit 0.*$|# Numlock enable\n[ -x /usr/bin/numlockx ] \&\& numlockx on\n\nexit 0|' /etc/rc.local 92 | echo "/usr/bin/numlockx on" | sudo tee -a /etc/X11/xinit/xinitrc 93 | echo "JAVA_HOME=\"/usr/lib/jvm/java-8-openjdk-amd64\"" | sudo tee -a /etc/environment 94 | sudo reboot 95 | -------------------------------------------------------------------------------- /script/run-all-models.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | : "${SLACK_API_TOKEN:?You need to provide a SLACK_API_TOKEN environment parameter}" 4 | for script in ../model/*.py 5 | do 6 | python3 ${script} 7 | EC=$? 8 | if [ ${EC} -eq 0 ] 9 | then 10 | echo "${script} completed successfully." 11 | else 12 | echo "${script} failed, sending notification..."
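# send a short message to the Slack channel configured in script/slack_notify.py so failed runs do not go unnoticed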
13 | python3 ./slack_notify.py "Failed at executing ${script}" 14 | fi 15 | done -------------------------------------------------------------------------------- /script/slack_notify.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import sys 4 | from slackclient import SlackClient 5 | 6 | slack_token = os.environ.get("SLACK_API_TOKEN") 7 | 8 | 9 | if slack_token: 10 | sc = SlackClient(slack_token) 11 | sc.api_call( 12 | "chat.postMessage", 13 | channel="#machinelearning", 14 | text="Notification: " + ', '.join(sys.argv[1:])) 15 | else: 16 | print('No slack notification: no slack API token environment variable "SLACK_API_TOKEN" set.') 17 | -------------------------------------------------------------------------------- /script/test-tensorflow.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | hello = tf.constant('Hello, TensorFlow!') 3 | sess = tf.Session() 4 | print(sess.run(hello)) 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | setup(name='Topology learning', 6 | version='1.0', 7 | description='Machine learning experiments for geospatial vector geometries', 8 | author='Rein van \'t Veer', 9 | author_email='rein.van.t.veer@geodan.nl', 10 | url='https://github.com/reinvantveer/Topology-Learning', 11 | packages=['model', 'model.topoml_util', 'model.baseline'], 12 | license='MIT', 13 | install_requires=[ 14 | 'sklearn', 15 | 'slackclient', 16 | 'scipy', 17 | 'keras', 18 | 'numpy', 19 | 'shapely', 20 | 'tensorflow-gpu' 21 | ], 22 | ) 23 | --------------------------------------------------------------------------------
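A minimal sketch (not part of the repository) of loading the archives written by prep/preprocess-neighborhoods.py above; the file name and key names follow the TRAIN_DATA_FILE constant and the np.savez_compressed calls in that script, the path is relative to the prep/ directory, and allow_pickle=True is an assumption for numpy versions that default it to False:

import numpy as np

train = np.load('../files/neighborhoods/neighborhoods_train_v7.npz', allow_pickle=True)
print(train['fixed_size_geoms'].shape)              # fixed-size vectorized geometries, one per neighborhood
print(train['elliptic_fourier_descriptors'].shape)  # elliptic Fourier descriptors per neighborhood
print(train['above_or_below_median'][:10])          # 1 if the neighborhood's inhabitant count is above the median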