├── .gitattributes ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── build_server ├── docker-compose.yml.example ├── teamcity_agent │ ├── Dockerfile │ ├── requirements.txt │ └── shut_down_on_empty_queue.py ├── teamcity_logs │ └── README.md └── teamcity_server_data │ └── README.md ├── docker-compose.yml ├── efd_orders.svg ├── efd_orders.svg.png ├── files ├── archaeology │ ├── LICENSE │ ├── README.md │ └── archaeology.csv.zip ├── buildings │ └── buildings.csv.zip ├── neighborhoods │ ├── LICENSE.md │ └── neighborhoods.csv.zip └── roads │ └── boogstralen.zip ├── model ├── __init__.py ├── all_grid_search.py ├── all_test.py ├── archaeology_convnet.py ├── archaeology_convnet_fixed.py ├── archaeology_lstm.py ├── baseline │ ├── __init__.py │ ├── all_baseline_models.py │ ├── archaeo_feature_type_decision_tree.py │ ├── archaeo_feature_type_knn.py │ ├── archaeo_feature_type_logistic_regression.py │ ├── archaeo_feature_type_svm_linear.py │ ├── archaeo_feature_type_svm_polynomial.py │ ├── archaeo_feature_type_svm_rbf.py │ ├── building_type_decision_tree.py │ ├── building_type_knn.py │ ├── building_type_logistic_regression.py │ ├── building_type_svm_linear.py │ ├── building_type_svm_polynomial.py │ ├── building_type_svm_rbf.py │ ├── neighborhood_inhabintants_decision_tree.py │ ├── neighborhood_inhabintants_knn.py │ ├── neighborhood_inhabintants_logistic_regression.py │ ├── neighborhood_inhabintants_svm_linear.py │ ├── neighborhood_inhabintants_svm_polynomial.py │ └── neighborhood_inhabintants_svm_rbf.py ├── building_convnet.py ├── building_convnet_fixed.py ├── building_lstm.py ├── configs │ └── README.md ├── grid_search.py ├── neighborhood_convnet.py ├── neighborhood_convnet_fixed.py ├── neighborhood_lstm.py ├── plots │ └── README.md └── topoml_util │ ├── ConsoleLogger.py │ ├── GaussianMixtureLoss.py │ ├── GeoVectorizer.py │ ├── LoggerCallback.py │ ├── PyplotLogger.py │ ├── Tokenizer.py │ ├── __init__.py │ ├── gaussian_loss.py │ ├── geom_fourier_descriptors.py │ ├── geom_scaler.py │ ├── np_gaussian_2d_loss.py │ ├── np_gmm_loss.py │ ├── sketch_rnn_model.py │ ├── slack_send.py │ ├── test_GaussianMixtureLoss.py │ ├── test_GeoVectorizer.py │ ├── test_Tokenizer.py │ ├── test_files │ ├── big_multipolygon_wkt.txt │ ├── example.csv │ ├── gmm_output.py │ ├── multipart_multipolygon_wkt.txt │ └── polygon_multipolygon.csv │ ├── test_fourier_descriptors.py │ ├── test_geom_loss.py │ ├── test_geom_scaler.py │ ├── test_np_gaussian_2d_loss.py │ ├── test_rasterization.py │ ├── test_sketch-rnn-model.py │ ├── test_wkt2pyplot.py │ └── wkt2pyplot.py ├── prep ├── ProgressBar.py ├── densified.py ├── export-data.sh ├── get-data.sh ├── preprocess-archaeology.py ├── preprocess-buildings.py ├── preprocess-neighborhoods.py ├── spatial-join.sql ├── triangles.py ├── util │ ├── __init__.py │ ├── layerToWGS.py │ └── sim.c └── vectorize_brt_osm.py ├── requirements.txt ├── script ├── build-script.sh ├── install-docker-ubuntu.sh ├── install-requirements.sh ├── run-all-models.sh ├── slack_notify.py └── test-tensorflow.py └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.sh text eol=lf 2 | *.npz filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | 
.Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Custom stuff 104 | /.idea/ 105 | /model/tensorboard_log/ 106 | /model/plots/*.png 107 | /model/configs/*.py 108 | /script/cuda/ 109 | !/continuous_delivery/teamcity_logs/README.md 110 | !/continuous_delivery/teamcity_server_data/README.md 111 | !/continuous_delivery/docker-compose.yml 112 | !/continuous_delivery/teamcity_agent/Dockerfile 113 | !/files/roads/boogstralen.zip 114 | !/continuous_delivery/ 115 | !/files/neighborhoods/neighborhoods_order_30_train.npz 116 | !/files/neighborhoods/neighborhoods_order_30_test.npz 117 | *.csv 118 | !/model/topoml_util/test_files/example.csv 119 | !/model/topoml_util/test_files/polygon_multipolygon.csv 120 | # Prevent committing credentials set in compose file 121 | build_server/docker-compose.yml 122 | /build_server/teamcity_server_data/config/ 123 | /build_server/teamcity_server_data/plugins/ 124 | /build_server/teamcity_server_data/system/ 125 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM geodata/gdal:2.1.3 2 | 3 | RUN apt-get update && apt-get install -y curl unzip 4 | RUN pip install shapely numpy 5 | WORKDIR /data/prep -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Rein van 't Veer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /build_server/docker-compose.yml.example: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | services: 4 | teamcity: 5 | image: jetbrains/teamcity-server 6 | restart: unless-stopped 7 | volumes: 8 | - ./teamcity_logs:/opt/teamcity/logs 9 | - ./teamcity_server_data:/data/teamcity_server/datadir 10 | ports: 11 | - "8111:8111" 12 | 13 | teamcity_agent: 14 | build: teamcity_agent 15 | restart: unless-stopped 16 | depends_on: 17 | - teamcity 18 | volumes: 19 | - ./teamcity_agent:/data/teamcity_agent/conf 20 | environment: 21 | - SERVER_URL=teamcity:8111 22 | - SLACK_API_TOKEN=yourslackapitoken 23 | - SLACK_CHANNEL=#machinelearning 24 | - AWS_ACCESS_KEY_ID=yourawsaccesskeyid 25 | - AWS_SECRET_ACCESS_KEY=yourawsaccesskeytoken 26 | -------------------------------------------------------------------------------- /build_server/teamcity_agent/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jetbrains/teamcity-agent 2 | 3 | RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ 4 | NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ 5 | apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ 6 | apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ 7 | echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ 8 | echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ 9 | echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list 10 | 11 | ENV CUDA_VERSION 9.0.176 12 | 13 | ENV CUDA_PKG_VERSION 9-0=$CUDA_VERSION-1 14 | RUN apt-get update && apt-get install -y --no-install-recommends \ 15 | cuda-cudart-$CUDA_PKG_VERSION && \ 16 | ln -s cuda-9.0 /usr/local/cuda && \ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | # nvidia-docker 1.0 20 | LABEL com.nvidia.volumes.needed="nvidia_driver" 21 | LABEL com.nvidia.cuda.version="${CUDA_VERSION}" 22 | 23 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 24 | echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf 25 | 26 | # nvidia-container-runtime 27 | ENV NVIDIA_VISIBLE_DEVICES all 28 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 29 | ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0" 30 | 31 | # NCCL 32 | ENV NCCL_VERSION 2.1.15 33 | 34 | RUN apt-get update && apt-get install -y --no-install-recommends \ 35 | cuda-libraries-$CUDA_PKG_VERSION \ 36 | libnccl2=$NCCL_VERSION-1+cuda9.0 && \ 37 | rm -rf /var/lib/apt/lists/* 38 | 39 | # CUDNN 40 | #ENV CUDNN_VERSION 7.1.1.5 41 | ENV CUDNN_VERSION 7.0.3.11 42 | LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" 43 | 44 | RUN apt-get update && apt-get install -y --no-install-recommends \ 45 | 
libcudnn7=$CUDNN_VERSION-1+cuda9.0 && \ 46 | rm -rf /var/lib/apt/lists/* 47 | 48 | # Extra env vars 49 | ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/ 50 | ENV CUDA_HOME=/usr/local/cuda 51 | 52 | RUN apt-get update && apt-get install -y python3-pip 53 | RUN pip3 install --upgrade pip 54 | COPY requirements.txt /requirements.txt 55 | RUN pip3 install -r requirements.txt 56 | 57 | # Install Git Large File Storage 58 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash 59 | RUN apt-get update && apt-get install -y --no-install-recommends \ 60 | git-lfs && \ 61 | rm -rf /var/lib/apt/lists/* 62 | RUN git lfs install 63 | -------------------------------------------------------------------------------- /build_server/teamcity_agent/requirements.txt: -------------------------------------------------------------------------------- 1 | Keras>=2.1.2 2 | numpy>=1.14.0 3 | pandas>=0.22.0 4 | scikit-learn>=0.19.1 5 | scipy>=1.0.0 6 | Shapely>=1.6.3 7 | slackclient>=1.1.0 8 | tensorflow-gpu>=1.6.0 9 | matplotlib>=2.1.2 10 | pyefd>=1.0 11 | boto3>=1.6 -------------------------------------------------------------------------------- /build_server/teamcity_agent/shut_down_on_empty_queue.py: -------------------------------------------------------------------------------- 1 | import http 2 | import os 3 | from datetime import datetime 4 | 5 | import boto3 6 | import requests 7 | 8 | from slackclient import SlackClient 9 | 10 | SCRIPT_NAME = os.path.basename(__file__) 11 | TIMESTAMP = str(datetime.now()).replace(':', '.') 12 | # Set this to the appropriate region 13 | REGION_NAME = 'eu-west-1' 14 | 15 | # Get environment variables 16 | # Slack is required. We need to know if something is wrong 17 | slack_token = os.environ['SLACK_API_TOKEN'] 18 | slack_channel = os.environ['SLACK_CHANNEL'] 19 | # We are also going to require Amazon credentials, set as environment variables 20 | amazon_id = os.environ['AWS_ACCESS_KEY_ID'] 21 | amazon_key = os.environ['AWS_SECRET_ACCESS_KEY'] 22 | 23 | # Initialize frameworks 24 | ec2 = boto3.client('ec2', region_name=REGION_NAME) 25 | sc = SlackClient(slack_token) 26 | 27 | 28 | # Slack notification function 29 | def notify(signature, message): 30 | sc.api_call("chat.postMessage", channel=slack_channel, 31 | text="Script " + signature + " notification: " + str(message)) 32 | 33 | 34 | # Get build queue length 35 | queue = "http://teamcity:8111/guestAuth/app/rest/buildQueue" 36 | headers = { 37 | 'Accept': "application/json", 38 | 'Cache-Control': "no-cache", 39 | } 40 | queue_res = requests.get(queue, headers=headers) 41 | queue_status = queue_res.json() 42 | queue_length = queue_status['count'] 43 | 44 | # Get instance id for this machine 45 | # https://stackoverflow.com/questions/33301880/how-to-obtain-current-instance-id-from-boto3#33307704 46 | try: 47 | instance_metadata = requests.get('http://169.254.169.254/latest/meta-data/instance-id') 48 | except ConnectionError as e: 49 | notify(SCRIPT_NAME, 'ERROR getting instance id, cannot issue commands') 50 | raise ConnectionError(e) 51 | 52 | instance_id = instance_metadata.text 53 | 54 | if queue_length == 0: 55 | print('build server reports empty queue, shutting down.') 56 | shutdown_res = ec2.stop_instances(InstanceIds=[instance_id]) 57 | http_status_code = shutdown_res['ResponseMetadata']['HTTPStatusCode'] 58 | http_status = http.HTTPStatus(http_status_code).name 59 | 60 | if http_status_code == 200: 61 | print('Stop instances:', http_status) 62 | notify(SCRIPT_NAME, 'successful shutdown 
of {} with response {}'.format(instance_id, http_status)) 63 | else: 64 | notify(SCRIPT_NAME, 'ERROR shutting down instance id: {}'.format(http_status)) 65 | else: 66 | notify(SCRIPT_NAME, 'job finished, build server reports non-empty queue, continuing.') 67 | 68 | 69 | -------------------------------------------------------------------------------- /build_server/teamcity_logs/README.md: -------------------------------------------------------------------------------- 1 | # Log dir 2 | This is the directory the build server logs are kept. -------------------------------------------------------------------------------- /build_server/teamcity_server_data/README.md: -------------------------------------------------------------------------------- 1 | # Server data 2 | This is the directory the server data are kept. -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | postgis: 5 | image: mdillon/postgis 6 | restart: unless-stopped 7 | ports: 8 | - '5432:5432' 9 | 10 | data-prep: 11 | build: . 12 | volumes: 13 | - ./files:/data/files 14 | - ./prep:/data/prep 15 | command: bash /data/prep/get-data.sh 16 | depends_on: 17 | - postgis 18 | -------------------------------------------------------------------------------- /efd_orders.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/efd_orders.svg.png -------------------------------------------------------------------------------- /files/archaeology/LICENSE: -------------------------------------------------------------------------------- 1 | Data in this folder is copyright (c) ADC ArcheoProjecten 2 | 3 | This work is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. -------------------------------------------------------------------------------- /files/archaeology/README.md: -------------------------------------------------------------------------------- 1 | # Archaeological feature data 2 | Data in this folder is (c) ADC ArcheoProjecten, redistributed under [CC-BY-SA 4.0 license](http://creativecommons.org/licenses/by-sa/4.0/) by kind permission of ADC ArcheoProjecten. 3 | 4 | Data from the following projects was used: 5 | 6 | Project | Reference | No. of features | has definitions 7 | --- | --- | --- | --- 8 | ENKN_09 | ADC ArcheoProjecten; Roessingh, W.; Lohof, E.; (2010): Enkhuizen Kadijken 5A en 5B Opgraving. DANS. https://doi.org/10.17026/dans-27r-e5f8 | 11058 | Yes 9 | VENO13_08 | ADC ArcheoProjecten; Gerrets, D.A.; Jacobs, E.; (2011): Venlo Venlo TPN Deelgebied 1 en 2 Opgraving. DANS. https://doi.org/10.17026/dans-26f-55zu | 5101 | Yes (joined) 10 | MONF_09 | ADC ArcheoProjecten; Veken, B. van der; Prangsma, N.M.; (2011): Montferland Didam Westelijke Randweg Kerkwijk Opgraving. DANS. https://doi.org/10.17026/dans-zmk-35vy | 5603 | Yes (joined) 11 | VEEE_07 | ADC ArcheoProjecten; Dijkstra, J.; Zuidhoff, F.S.; (2011): Veere Rijksweg N57 Proefsleuven Begeleiding Opgraving. DANS. https://doi.org/10.17026/dans-xyc-re2w | 5243 | Yes 12 | GOUA_08 | ADC ArcheoProjecten; Dijkstra, J.; Houkes, M.C. ; Ostkamp, S. 
; (2010): Gouda Bolwerk Opgraving en Begeleiding. DANS. https://doi.org/10.17026/dans-xzm-x29h | 5306 | Yes 13 | VENO_02 | ADC ArcheoProjecten; Velde, H. van de; Ostkamp, S.; Veldman, H.A.P.; Wyns, S.; (2002): Venlo Maasboulevard. DANS. https://doi.org/10.17026/dans-x84-msac | 5207 | Yes 14 | KATK_08 | ADC ArcheoProjecten; Velde, H.M. van der; (2011): Katwijk Zanderij Westerbaan Opgraving. DANS. https://doi.org/10.17026/dans-znz-r2ba | 3187 | Yes (joined) 15 | WIJD_07 | Dijkstra, J. (ADC ArcheoProjecten) (2012): Wijk bij Duurstede Veilingterrein DO Opgraving. DANS. https://doi.org/10.17026/dans-x8d-qmae | 12131 | Yes (joined) 16 | OOST_10 | Roessingh, W. (ADC ArcheoProjecten); Blom, E. (ADC ArcheoProjecten) (2012): Oosterhout Vrachelen De Contreie Vrachelen 4 Opgraving. DANS. https://doi.org/10.17026/dans-25d-fpe5 | 17251 | Yes (joined) 17 | VEGL_10 | ADC ArcheoProjecten; Van der Veken, B. (ADC ArcheoProjecten); Blom, E. (ADC ArcheoProjecten) (2012): Veghel Scheiffelaar II Opgraving. DANS. https://doi.org/10.17026/dans-z93-7zbe | 4271 | Yes (joined) 18 | | | | 19 | TOTAL | | 74358 -------------------------------------------------------------------------------- /files/archaeology/archaeology.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/files/archaeology/archaeology.csv.zip -------------------------------------------------------------------------------- /files/buildings/buildings.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/files/buildings/buildings.csv.zip -------------------------------------------------------------------------------- /files/neighborhoods/LICENSE.md: -------------------------------------------------------------------------------- 1 | # License 2 | Wijken en buurten data (c) Centraal Bureau voor de Statistiek, 3 | data licensed [CC-BY](https://creativecommons.org/licenses/by/4.0/) 4 | 5 | # Source 6 | https://geodata.nationaalgeoregister.nl/wijkenbuurten2017/wfs?request=GetFeature&service=WFS&version=2.0.0&typeName=cbs_buurten_2017&outputFormat=csv&srsName=EPSG%3A4326&PropertyName=aantal_inwoners%2Cgeom 7 | -------------------------------------------------------------------------------- /files/neighborhoods/neighborhoods.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/files/neighborhoods/neighborhoods.csv.zip -------------------------------------------------------------------------------- /files/roads/boogstralen.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/files/roads/boogstralen.zip -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/model/__init__.py -------------------------------------------------------------------------------- /model/all_grid_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import sys 4 | from 
sklearn.model_selection import ParameterGrid 5 | from topoml_util.slack_send import notify 6 | 7 | SCRIPT_VERSION = '0.0.9' 8 | N_TIMES = 6 9 | 10 | HYPERPARAMS = { 11 | 'BATCH_SIZE': [512], 12 | 'REPEAT_DEEP_ARCH': [1], 13 | 'LSTM_SIZE': [64], 14 | 'DENSE_SIZE': [32], 15 | 'EPOCHS': [200], 16 | 'LEARNING_RATE': [1e-4], 17 | # 'GEOM_SCALE': [1e0, 1e-1, 1e-2, 1e-3], 18 | 'RECURRENT_DROPOUT': [0.10], 19 | # 'PATIENCE': [8, 16, 24, 32, 40], 20 | 'EARLY_STOPPING': 1 21 | } 22 | grid = list(ParameterGrid(HYPERPARAMS)) 23 | 24 | scripts = [ 25 | # 'neighborhood_inhabitants.py', 26 | # 'building_type.py', 27 | # 'archaeological_features.py' 28 | ] 29 | 30 | for configuration in grid: 31 | # Set environment variables (this allows you to do hyperparam searches from any scripting environment) 32 | for key, value in configuration.items(): 33 | os.environ[key] = str(value) 34 | 35 | # repeat to get a sense of results spread 36 | for _ in range(N_TIMES): 37 | for script in scripts: 38 | r_code = os.system('python3 {}'.format(script)) 39 | if not r_code == 0: 40 | print('{} exited with error'.format(script)) 41 | notify('{} grid search'.format(script), 'with error') 42 | sys.exit(1) 43 | 44 | notify('All grid search', 'no errors') 45 | print('All grid search', 'finished successfully') 46 | -------------------------------------------------------------------------------- /model/all_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Final test script for evaluation statistics 3 | """ 4 | 5 | import os 6 | 7 | import sys 8 | from sklearn.model_selection import ParameterGrid 9 | from topoml_util.slack_send import notify 10 | 11 | notify('ALL TEST SCRIPT RUNNING FINAL TESTS', 'STARTING') 12 | 13 | SCRIPT_VERSION = '1.0.0' 14 | N_TIMES = 1 15 | 16 | HYPERPARAMS = { # All using standard hyperparameters 17 | # 'BATCH_SIZE': [512], 18 | # 'REPEAT_DEEP_ARCH': [0], 19 | # 'LSTM_SIZE': [64], 20 | # 'DENSE_SIZE': [32], 21 | # 'EPOCHS': [200], 22 | # 'LEARNING_RATE': [1e-4], 23 | # 'GEOM_SCALE': [1e0, 1e-1, 1e-2, 1e-3], # Leave at standard normalization 24 | # 'RECURRENT_DROPOUT': [0.10], 25 | # 'PATIENCE': [8, 16, 24, 32, 40], # Early stopping disabled by default 26 | } 27 | grid = list(ParameterGrid(HYPERPARAMS)) 28 | 29 | scripts = [ 30 | # 'neighborhood_convnet.py', 31 | # 'neighborhood_lstm.py', 32 | # 'building_convnet.py', 33 | # 'building_lstm.py', 34 | 'archaeology_convnet.py', 35 | 'archaeology_lstm.py' 36 | ] 37 | 38 | for configuration in grid: 39 | # Set environment variables (this allows you to do hyperparam searches from any scripting environment) 40 | for key, value in configuration.items(): 41 | os.environ[key] = str(value) 42 | 43 | # repeat to get a sense of results spread 44 | for _ in range(N_TIMES): 45 | for script in scripts: 46 | r_code = os.system('python3 {} --test'.format(script)) 47 | if not r_code == 0: 48 | print('{} exited with error'.format(script)) 49 | notify('{} grid search'.format(script), 'with error') 50 | sys.exit(1) 51 | 52 | notify('ALL TEST', 'no errors') 53 | print('ALL TEST', 'finished successfully') 54 | -------------------------------------------------------------------------------- /model/baseline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/model/baseline/__init__.py -------------------------------------------------------------------------------- 
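The two driver scripts above (all_grid_search.py and all_test.py) hand hyperparameters to the individual model scripts purely through environment variables: each configuration produced by ParameterGrid is written into os.environ as strings before the target script is launched with os.system('python3 ...'). Below is a minimal, illustrative sketch of how a receiving script could read those values back; the variable names are taken from the HYPERPARAMS dict above, but the actual parsing code inside the model scripts is not part of this excerpt, so treat the snippet as an assumption rather than the repository's implementation.

import os

# Illustrative only: read back the hyperparameters exported by the driver
# scripts; the defaults mirror the values listed in HYPERPARAMS above.
BATCH_SIZE = int(os.getenv('BATCH_SIZE', '512'))
LSTM_SIZE = int(os.getenv('LSTM_SIZE', '64'))
LEARNING_RATE = float(os.getenv('LEARNING_RATE', '1e-4'))
RECURRENT_DROPOUT = float(os.getenv('RECURRENT_DROPOUT', '0.10'))

print('batch size', BATCH_SIZE, 'lstm size', LSTM_SIZE,
      'learning rate', LEARNING_RATE, 'recurrent dropout', RECURRENT_DROPOUT)
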
/model/baseline/all_baseline_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import sys 4 | from sklearn.model_selection import ParameterGrid 5 | from topoml_util.slack_send import notify 6 | 7 | SCRIPT_VERSION = '0.0.9' 8 | N_TIMES = 1 9 | 10 | HYPERPARAMS = { 11 | # 'BATCH_SIZE': [512], 12 | # 'REPEAT_DEEP_ARCH': [1], 13 | # 'LSTM_SIZE': [64], 14 | # 'DENSE_SIZE': [32], 15 | # 'EPOCHS': [200], 16 | # 'LEARNING_RATE': [1e-4], 17 | # 'GEOM_SCALE': [1e0, 1e-1, 1e-2, 1e-3], 18 | # 'RECURRENT_DROPOUT': [0.10], 19 | # 'PATIENCE': [8, 16, 24, 32, 40], 20 | # 'EARLY_STOPPING': 1 21 | } 22 | grid = list(ParameterGrid(HYPERPARAMS)) 23 | 24 | scripts = [ 25 | 'archaeo_feature_type_decision_tree.py', 26 | 'archaeo_feature_type_knn.py', 27 | 'archaeo_feature_type_logistic_regression.py', 28 | 'archaeo_feature_type_svm_rbf.py', 29 | 'building_type_decision_tree.py', 30 | 'building_type_knn.py', 31 | 'building_type_logistic_regression.py', 32 | 'building_type_svm_rbf.py', 33 | 'neighborhood_inhabintants_decision_tree.py', 34 | 'neighborhood_inhabintants_knn.py', 35 | 'neighborhood_inhabintants_logistic_regression.py', 36 | 'neighborhood_inhabintants_svm_rbf.py', 37 | ] 38 | 39 | for configuration in grid: 40 | # Set environment variables (this allows you to do hyperparam searches from any scripting environment) 41 | for key, value in configuration.items(): 42 | os.environ[key] = str(value) 43 | 44 | # repeat to get a sense of results spread 45 | for _ in range(N_TIMES): 46 | for script in scripts: 47 | print('Executing', script) 48 | r_code = os.system('python3 {}'.format(script)) 49 | if not r_code == 0: 50 | print('{} exited with error'.format(script)) 51 | notify('{} grid search'.format(script), 'with error') 52 | sys.exit(1) 53 | 54 | notify('All grid search', 'no errors') 55 | print('All grid search', 'finished successfully') 56 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_decision_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.tree import DecisionTreeClassifier 19 | 20 | PACKAGE_PARENT = '..' 
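# The two statements below resolve this script's absolute directory and append
# its parent (the model/ package root, since PACKAGE_PARENT is '..') to
# sys.path, so that the shared topoml_util helpers used further down, such as
# topoml_util.slack_send.notify, can be imported when the file is run directly
# from the baseline/ folder.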
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.2' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | param_grid = {'max_depth': range(5, 11)} 53 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 54 | grid = GridSearchCV( 55 | DecisionTreeClassifier(), 56 | n_jobs=NUM_CPUS, 57 | param_grid=param_grid, 58 | verbose=2, 59 | cv=cv) 60 | 61 | print('Performing grid search on model...') 62 | print('Using {} threads for grid search'.format(NUM_CPUS)) 63 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 64 | 65 | best_order = 0 66 | best_score = 0 67 | best_params = {} 68 | 69 | for order in EFD_ORDERS: 70 | print('Fitting order {} fourier descriptors'.format(order)) 71 | stop_position = 3 + (order * 8) 72 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 73 | print("The best parameters for order {} are {} with a score of {}\n".format( 74 | order, grid.best_params_, grid.best_score_)) 75 | if grid.best_score_ > best_score: 76 | best_score = grid.best_score_ 77 | best_order = order 78 | best_params = grid.best_params_ 79 | 80 | print('Training model on order {} with best parameters {}'.format( 81 | best_order, best_params)) 82 | stop_position = 3 + (best_order * 8) 83 | clf = DecisionTreeClassifier(max_depth=best_params['max_depth']) 84 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 85 | print('Cross-validation scores:', scores) 86 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = 
clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.neighbors import KNeighborsClassifier 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.5' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | k_range = np.linspace(start=21, stop=30, num=10, dtype=int) 53 | param_grid = dict(n_neighbors=k_range) 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | KNeighborsClassifier(), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 
| grid.fit(train_fourier_descriptors[::5, :stop_position], train_labels[::5]) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = KNeighborsClassifier(n_neighbors=best_params['n_neighbors']) 85 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 86 | print('Cross-validation scores:', scores) 87 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 88 | 89 | # Run predictions on unseen test data to verify generalization 90 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 91 | if not path.exists(): 92 | print("Retrieving test data from web...") 93 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 94 | 95 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 96 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 97 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 98 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 99 | 100 | print('Run on test data...') 101 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 102 | test_accuracy = accuracy_score(test_labels, predictions) 103 | 104 | runtime = time() - SCRIPT_START 105 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 106 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 107 | print(message) 108 | notify(SCRIPT_NAME, message) 109 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_logistic_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | PACKAGE_PARENT = '..' 
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | # Grid search 53 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 54 | param_grid = dict(C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | LogisticRegression(), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = LogisticRegression(C=best_params['C']) 86 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 87 | print('Cross-validation scores:', scores) 88 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 89 | 90 | # Run predictions on unseen test data to verify generalization 91 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 92 | if not path.exists(): 93 | print("Retrieving test data from web...") 94 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 95 | 96 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 97 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 98 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 99 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 100 | 101 | print('Run on test data...') 102 
| predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 103 | test_accuracy = accuracy_score(test_labels, predictions) 104 | 105 | runtime = time() - SCRIPT_START 106 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 107 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 108 | print(message) 109 | notify(SCRIPT_NAME, message) 110 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_svm_linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-1, 1e0, 1e1, 1e2, 1e3] 53 | param_grid = dict(C=C_range) 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | SVC(kernel='linear', max_iter=int(1e8)), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 | 
grid.fit(train_fourier_descriptors[::10, :stop_position], train_labels[::10]) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = SVC(kernel='linear', C=best_params['C'], max_iter=int(1e8)) 85 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 86 | 87 | # Run predictions on unseen test data to verify generalization 88 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 89 | if not path.exists(): 90 | print("Retrieving test data from web...") 91 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 92 | 93 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 94 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 95 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 96 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 97 | 98 | print('Run on test data...') 99 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 100 | test_accuracy = accuracy_score(test_labels, predictions) 101 | 102 | runtime = time() - SCRIPT_START 103 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 104 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 105 | print(message) 106 | notify(SCRIPT_NAME, message) 107 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_svm_polynomial.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 53 | degree_range = range(1, 7) 54 | param_grid = dict(degree=degree_range, C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | SVC(kernel='poly', max_iter=int(1e8)), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[::10, :stop_position], train_labels[::10]) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = SVC(kernel='poly', C=best_params['C'], degree=best_params['degree']) 86 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 
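# As in training, prediction only uses the first stop_position columns of the
# test features; stop_position = 3 + (best_order * 8), so the test matrix is
# truncated to the same elliptic Fourier descriptor order selected by the grid
# search above.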
101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/archaeo_feature_type_svm_rbf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the type of an archaeological feature, based solely on the geometry for 3 | that feature. The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/archaeology/' 31 | TRAIN_DATA_FILE = 'archaeology_train_v7.npz' 32 | TEST_DATA_FILE = 'archaeology_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11377' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11376' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['feature_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-1, 1e0, 1e1, 1e2, 1e3] 53 | gamma_range = np.logspace(-4, 4, 9) 54 | param_grid = dict(gamma=gamma_range, C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | SVC(kernel='rbf'), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[::5, :stop_position], 
train_labels[::5]) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma']) 86 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['feature_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/building_type_decision_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.tree import DecisionTreeClassifier 19 | 20 | PACKAGE_PARENT = '..' 
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.2' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | param_grid = {'max_depth': range(6, 13)} 53 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 54 | grid = GridSearchCV( 55 | DecisionTreeClassifier(), 56 | n_jobs=NUM_CPUS, 57 | param_grid=param_grid, 58 | verbose=2, 59 | cv=cv) 60 | 61 | print('Performing grid search on model...') 62 | print('Using {} threads for grid search'.format(NUM_CPUS)) 63 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 64 | 65 | best_order = 0 66 | best_score = 0 67 | best_params = {} 68 | 69 | for order in EFD_ORDERS: 70 | print('Fitting order {} fourier descriptors'.format(order)) 71 | stop_position = 3 + (order * 8) 72 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 73 | print("The best parameters for order {} are {} with a score of {}\n".format( 74 | order, grid.best_params_, grid.best_score_)) 75 | if grid.best_score_ > best_score: 76 | best_score = grid.best_score_ 77 | best_order = order 78 | best_params = grid.best_params_ 79 | 80 | print('Training model on order {} with best parameters {}'.format( 81 | best_order, best_params)) 82 | stop_position = 3 + (best_order * 8) 83 | clf = DecisionTreeClassifier(max_depth=best_params['max_depth']) 84 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 85 | print('Cross-validation scores:', scores) 86 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = 
clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/building_type_knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.neighbors import KNeighborsClassifier 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.5' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | k_range = np.linspace(start=21, stop=30, num=10, dtype=int) 53 | param_grid = dict(n_neighbors=k_range) 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | KNeighborsClassifier(), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 | 
grid.fit(train_fourier_descriptors[::5, :stop_position], train_labels[::5]) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = KNeighborsClassifier(n_neighbors=best_params['n_neighbors']) 85 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 86 | print('Cross-validation scores:', scores) 87 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 88 | 89 | # Run predictions on unseen test data to verify generalization 90 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 91 | if not path.exists(): 92 | print("Retrieving test data from web...") 93 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 94 | 95 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 96 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 97 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 98 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 99 | 100 | print('Run on test data...') 101 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 102 | test_accuracy = accuracy_score(test_labels, predictions) 103 | 104 | runtime = time() - SCRIPT_START 105 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 106 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 107 | print(message) 108 | notify(SCRIPT_NAME, message) 109 | -------------------------------------------------------------------------------- /model/baseline/building_type_logistic_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | PACKAGE_PARENT = '..' 
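# In the grid search below, the descriptor matrix is sliced per elliptic Fourier
# descriptor order with stop_position = 3 + (order * 8): the array appears to store
# 3 leading values followed by 8 coefficients per order. For example, order 4 keeps
# the first 3 + 4 * 8 = 35 columns of train_fourier_descriptors.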
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | # Grid search 53 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 54 | param_grid = dict(C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | LogisticRegression(), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = LogisticRegression(C=best_params['C']) 86 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 87 | print('Cross-validation scores:', scores) 88 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 89 | 90 | # Run predictions on unseen test data to verify generalization 91 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 92 | if not path.exists(): 93 | print("Retrieving test data from web...") 94 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 95 | 96 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 97 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 98 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 99 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 100 | 101 | print('Run on test data...') 102 | 
predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 103 | test_accuracy = accuracy_score(test_labels, predictions) 104 | 105 | runtime = time() - SCRIPT_START 106 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 107 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 108 | print(message) 109 | notify(SCRIPT_NAME, message) 110 | -------------------------------------------------------------------------------- /model/baseline/building_type_svm_linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-1, 1e0, 1e1, 1e2, 1e3] 53 | param_grid = dict(C=C_range) 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | SVC(kernel='linear', max_iter=int(1e7)), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 | grid.fit(train_fourier_descriptors[::20, :stop_position], 
train_labels[::20]) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = SVC(kernel='linear', C=best_params['C'], max_iter=int(1e7)) 85 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 86 | 87 | # Run predictions on unseen test data to verify generalization 88 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 89 | if not path.exists(): 90 | print("Retrieving test data from web...") 91 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 92 | 93 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 94 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 95 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 96 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 97 | 98 | print('Run on test data...') 99 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 100 | test_accuracy = accuracy_score(test_labels, predictions) 101 | 102 | runtime = time() - SCRIPT_START 103 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 104 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 105 | print(message) 106 | notify(SCRIPT_NAME, message) 107 | -------------------------------------------------------------------------------- /model/baseline/building_type_svm_polynomial.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 
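# Note that the grid search below fits on every 16th training sample
# (train_fourier_descriptors[::16]), presumably to keep the polynomial-kernel SVM
# search tractable; the final classifier is then fitted on the full training set
# with the best parameters found.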
21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.4' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 53 | degree_range = range(1, 7) 54 | param_grid = dict(degree=degree_range, C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | SVC(kernel='poly', max_iter=int(1e7)), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[::16, :stop_position], train_labels[::16]) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = SVC(kernel='poly', C=best_params['C'], degree=best_params['degree']) 86 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | 
test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/building_type_svm_rbf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 4 | """ 5 | 6 | import multiprocessing 7 | import os 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from sklearn.metrics import accuracy_score 16 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import SVC 19 | 20 | PACKAGE_PARENT = '..' 21 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 22 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 23 | 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '1.0.1' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 30 | DATA_FOLDER = SCRIPT_DIR + '/../../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 36 | SCRIPT_START = time() 37 | 38 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 39 | # Load training data 40 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 41 | if not path.exists(): 42 | print("Retrieving training data from web...") 43 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 44 | 45 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 46 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 47 | train_labels = train_loaded['building_type'] 48 | 49 | scaler = StandardScaler().fit(train_fourier_descriptors) 50 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 51 | 52 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 53 | gamma_range = np.logspace(-2, 3, 6) 54 | param_grid = dict(gamma=gamma_range, C=C_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | SVC(kernel='rbf', max_iter=int(1e8)), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[::10, :stop_position], train_labels[::10]) 75 
| print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma']) 86 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 87 | 88 | # Run predictions on unseen test data to verify generalization 89 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 90 | if not path.exists(): 91 | print("Retrieving test data from web...") 92 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 93 | 94 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 95 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 96 | test_labels = np.asarray(test_loaded['building_type'], dtype=int) 97 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 98 | 99 | print('Run on test data...') 100 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 101 | test_accuracy = accuracy_score(test_labels, predictions) 102 | 103 | runtime = time() - SCRIPT_START 104 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 105 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 106 | print(message) 107 | notify(SCRIPT_NAME, message) 108 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_decision_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.tree import DecisionTreeClassifier 20 | 21 | PACKAGE_PARENT = '..' 
22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.8' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'] 49 | 50 | scaler = StandardScaler().fit(train_fourier_descriptors) 51 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 52 | 53 | param_grid = {'max_depth': range(4, 10)} 54 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 55 | grid = GridSearchCV( 56 | DecisionTreeClassifier(), 57 | n_jobs=NUM_CPUS, 58 | param_grid=param_grid, 59 | verbose=2, 60 | cv=cv) 61 | 62 | print('Performing grid search on model...') 63 | print('Using {} threads for grid search'.format(NUM_CPUS)) 64 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 65 | 66 | best_order = 0 67 | best_score = 0 68 | best_params = {} 69 | 70 | for order in EFD_ORDERS: 71 | print('Fitting order {} fourier descriptors'.format(order)) 72 | stop_position = 3 + (order * 8) 73 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 74 | print("The best parameters for order {} are {} with a score of {}\n".format( 75 | order, grid.best_params_, grid.best_score_)) 76 | if grid.best_score_ > best_score: 77 | best_score = grid.best_score_ 78 | best_order = order 79 | best_params = grid.best_params_ 80 | 81 | print('Training model on order {} with best parameters {}'.format( 82 | best_order, best_params)) 83 | stop_position = 3 + (best_order * 8) 84 | clf = DecisionTreeClassifier(max_depth=best_params['max_depth']) 85 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 86 | print('Cross-validation scores:', scores) 87 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 88 | 89 | # Run predictions on unseen test data to verify generalization 90 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 91 | if not path.exists(): 92 | print("Retrieving test data from web...") 93 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 94 | 95 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 96 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 97 | test_labels = np.asarray(test_loaded['above_or_below_median'], dtype=int) 98 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 99 | 100 | print('Run on test data...') 101 | 
predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 102 | test_accuracy = accuracy_score(test_labels, predictions) 103 | 104 | runtime = time() - SCRIPT_START 105 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 106 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 107 | print(message) 108 | notify(SCRIPT_NAME, message) 109 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.neighbors import KNeighborsClassifier 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | PACKAGE_PARENT = '..' 22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.1' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | 50 | scaler = StandardScaler().fit(train_fourier_descriptors) 51 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 52 | 53 | k_range = np.linspace(start=21, stop=30, num=10, dtype=int) 54 | param_grid = dict(n_neighbors=k_range) 55 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 56 | grid = GridSearchCV( 57 | KNeighborsClassifier(), 58 | n_jobs=NUM_CPUS, 59 | param_grid=param_grid, 60 | verbose=2, 61 | cv=cv) 62 | 63 | print('Performing grid search on model...') 64 | print('Using {} threads for grid search'.format(NUM_CPUS)) 65 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 66 | 67 | best_order = 0 68 | best_score = 0 69 | best_params = {} 70 | 71 | 
for order in EFD_ORDERS: 72 | print('Fitting order {} fourier descriptors'.format(order)) 73 | stop_position = 3 + (order * 8) 74 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 75 | print("The best parameters for order {} are {} with a score of {}\n".format( 76 | order, grid.best_params_, grid.best_score_)) 77 | if grid.best_score_ > best_score: 78 | best_score = grid.best_score_ 79 | best_order = order 80 | best_params = grid.best_params_ 81 | 82 | print('Training model on order {} with best parameters {}'.format( 83 | best_order, best_params)) 84 | stop_position = 3 + (best_order * 8) 85 | clf = KNeighborsClassifier(n_neighbors=best_params['n_neighbors']) 86 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 87 | print('Cross-validation scores:', scores) 88 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 89 | 90 | # Run predictions on unseen test data to verify generalization 91 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 92 | if not path.exists(): 93 | print("Retrieving test data from web...") 94 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 95 | 96 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 97 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 98 | test_labels = test_loaded['above_or_below_median'][:, 0] 99 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 100 | 101 | print('Run on test data...') 102 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 103 | test_accuracy = accuracy_score(test_labels, predictions) 104 | 105 | runtime = time() - SCRIPT_START 106 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 107 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 108 | print(message) 109 | notify(SCRIPT_NAME, message) 110 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_logistic_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.linear_model import LogisticRegression 17 | from sklearn.metrics import accuracy_score 18 | from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, GridSearchCV 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | PACKAGE_PARENT = '..' 
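# The stored label array is two-dimensional; taking [:, 0] below selects its first
# column, giving the 1-D class vector that scikit-learn expects. Ten-fold
# cross-validation scores are printed before the final fit on the full training set.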
22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.0' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | 50 | scaler = StandardScaler().fit(train_fourier_descriptors) 51 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 52 | 53 | # Grid search 54 | C_range = [1e-3, 1e-2, 1e-1, 1e0, 1e1] 55 | param_grid = dict(C=C_range) 56 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 57 | grid = GridSearchCV( 58 | LogisticRegression(), 59 | n_jobs=NUM_CPUS, 60 | param_grid=param_grid, 61 | verbose=2, 62 | cv=cv) 63 | 64 | print('Performing grid search on model...') 65 | print('Using {} threads for grid search'.format(NUM_CPUS)) 66 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 67 | 68 | best_order = 0 69 | best_score = 0 70 | best_params = {} 71 | 72 | for order in EFD_ORDERS: 73 | print('Fitting order {} fourier descriptors'.format(order)) 74 | stop_position = 3 + (order * 8) 75 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 76 | print("The best parameters for order {} are {} with a score of {}\n".format( 77 | order, grid.best_params_, grid.best_score_)) 78 | if grid.best_score_ > best_score: 79 | best_score = grid.best_score_ 80 | best_order = order 81 | best_params = grid.best_params_ 82 | 83 | print('Training model on order {} with best parameters {}'.format( 84 | best_order, best_params)) 85 | stop_position = 3 + (best_order * 8) 86 | clf = LogisticRegression(C=best_params['C']) 87 | scores = cross_val_score(clf, train_fourier_descriptors[:, :stop_position], train_labels, cv=10, n_jobs=NUM_CPUS) 88 | print('Cross-validation scores:', scores) 89 | clf.fit(train_fourier_descriptors[:, :stop_position], train_labels) 90 | 91 | # Run predictions on unseen test data to verify generalization 92 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 93 | if not path.exists(): 94 | print("Retrieving test data from web...") 95 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 96 | 97 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 98 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 99 | test_labels = np.asarray(test_loaded['above_or_below_median'][:, 0], dtype=int) 100 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 101 | 102 | 
print('Run on test data...') 103 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 104 | test_accuracy = accuracy_score(test_labels, predictions) 105 | 106 | runtime = time() - SCRIPT_START 107 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 108 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 109 | print(message) 110 | notify(SCRIPT_NAME, message) 111 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_svm_linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.svm import SVC 20 | 21 | PACKAGE_PARENT = '..' 22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.0' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | train_labels = np.reshape(train_labels, (train_labels.shape[0])) 50 | 51 | scaler = StandardScaler().fit(train_fourier_descriptors) 52 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 53 | 54 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 55 | param_grid = dict(C=C_range) 56 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 57 | grid = GridSearchCV( 58 | SVC(kernel='linear'), 59 | n_jobs=NUM_CPUS, 60 | param_grid=param_grid, 61 | verbose=2, 62 | cv=cv) 63 | 64 | print('Performing grid search on model...') 65 | print('Using {} threads for grid search'.format(NUM_CPUS)) 66 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 67 | 68 | best_order = 0 69 | 
best_score = 0 70 | best_params = {} 71 | 72 | for order in EFD_ORDERS: 73 | print('Fitting order {} fourier descriptors'.format(order)) 74 | stop_position = 3 + (order * 8) 75 | grid.fit(train_fourier_descriptors[::5, :stop_position], train_labels[::5]) 76 | print("The best parameters for order {} are {} with a score of {}\n".format( 77 | order, grid.best_params_, grid.best_score_)) 78 | if grid.best_score_ > best_score: 79 | best_score = grid.best_score_ 80 | best_order = order 81 | best_params = grid.best_params_ 82 | 83 | print('Training model on order {} with best parameters {}'.format( 84 | best_order, best_params)) 85 | stop_position = 3 + (best_order * 8) 86 | clf = SVC(kernel='linear', C=best_params['C'], max_iter=int(1e7)) 87 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 88 | 89 | # Run predictions on unseen test data to verify generalization 90 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 91 | if not path.exists(): 92 | print("Retrieving test data from web...") 93 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 94 | 95 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 96 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 97 | test_labels = test_loaded['above_or_below_median'][:, 0] 98 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 99 | test_labels = np.reshape(test_labels, (test_labels.shape[0])) 100 | 101 | print('Run on test data...') 102 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 103 | test_accuracy = accuracy_score(test_labels, predictions) 104 | 105 | runtime = time() - SCRIPT_START 106 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 107 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 108 | print(message) 109 | notify(SCRIPT_NAME, message) 110 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_svm_polynomial.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.svm import SVC 20 | 21 | PACKAGE_PARENT = '..' 
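# This variant grid-searches both the penalty C and the polynomial degree (1 to 6)
# for the poly-kernel SVC. The np.reshape call below flattens the selected label
# column to the 1-D shape expected by scikit-learn.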
22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.1' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | train_labels = np.reshape(train_labels, (train_labels.shape[0])) 50 | 51 | scaler = StandardScaler().fit(train_fourier_descriptors) 52 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 53 | 54 | C_range = [1e0, 1e1, 1e2, 1e3, 1e4, 1e5] 55 | degree_range = range(1, 7) 56 | param_grid = dict(degree=degree_range, C=C_range) 57 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 58 | grid = GridSearchCV( 59 | SVC(kernel='poly'), 60 | n_jobs=NUM_CPUS, 61 | param_grid=param_grid, 62 | verbose=2, 63 | cv=cv) 64 | 65 | print('Performing grid search on model...') 66 | print('Using {} threads for grid search'.format(NUM_CPUS)) 67 | print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 68 | 69 | best_order = 0 70 | best_score = 0 71 | best_params = {} 72 | 73 | for order in EFD_ORDERS: 74 | print('Fitting order {} fourier descriptors'.format(order)) 75 | stop_position = 3 + (order * 8) 76 | grid.fit(train_fourier_descriptors[:, :stop_position], train_labels) 77 | print("The best parameters for order {} are {} with a score of {}\n".format( 78 | order, grid.best_params_, grid.best_score_)) 79 | if grid.best_score_ > best_score: 80 | best_score = grid.best_score_ 81 | best_order = order 82 | best_params = grid.best_params_ 83 | 84 | print('Training model on order {} with best parameters {}'.format( 85 | best_order, best_params)) 86 | stop_position = 3 + (best_order * 8) 87 | clf = SVC(kernel='poly', 88 | C=best_params['C'], 89 | degree=best_params['degree']) 90 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 91 | 92 | # Run predictions on unseen test data to verify generalization 93 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 94 | if not path.exists(): 95 | print("Retrieving test data from web...") 96 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 97 | 98 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 99 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 100 | test_labels = test_loaded['above_or_below_median'][:, 0] 101 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 102 | test_labels = np.reshape(test_labels, 
(test_labels.shape[0])) 103 | 104 | print('Run on test data...') 105 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 106 | test_accuracy = accuracy_score(test_labels, predictions) 107 | 108 | runtime = time() - SCRIPT_START 109 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 110 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 111 | print(message) 112 | notify(SCRIPT_NAME, message) 113 | -------------------------------------------------------------------------------- /model/baseline/neighborhood_inhabintants_svm_rbf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 5 | """ 6 | 7 | import multiprocessing 8 | import os 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.svm import SVC 20 | 21 | PACKAGE_PARENT = '..' 22 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 23 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 24 | 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '1.0.1' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | NUM_CPUS = multiprocessing.cpu_count() - 1 or 1 31 | DATA_FOLDER = SCRIPT_DIR + '/../../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | EFD_ORDERS = [0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24] 37 | SCRIPT_START = time() 38 | 39 | if __name__ == '__main__': # this is to squelch warnings on scikit-learn multithreaded grid search 40 | # Load training data 41 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 42 | if not path.exists(): 43 | print("Retrieving training data from web...") 44 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 45 | 46 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 47 | train_fourier_descriptors = train_loaded['elliptic_fourier_descriptors'] 48 | train_labels = train_loaded['above_or_below_median'][:, 0] 49 | train_labels = np.reshape(train_labels, (train_labels.shape[0])) 50 | 51 | scaler = StandardScaler().fit(train_fourier_descriptors) 52 | train_fourier_descriptors = scaler.transform(train_fourier_descriptors) 53 | 54 | # Grid search 55 | C_range = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] 56 | gamma_range = np.logspace(-3, 3, 7) 57 | param_grid = dict(gamma=gamma_range, C=C_range) 58 | cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) 59 | grid = GridSearchCV( 60 | SVC(kernel='rbf'), 61 | n_jobs=NUM_CPUS, 62 | param_grid=param_grid, 63 | verbose=2, 64 | cv=cv) 65 | 66 | print('Performing grid search on model...') 67 | print('Using {} threads for grid search'.format(NUM_CPUS)) 68 | 
print('Searching {} elliptic fourier descriptor orders'.format(EFD_ORDERS)) 69 | 70 | best_order = 0 71 | best_score = 0 72 | best_params = {} 73 | 74 | for order in EFD_ORDERS: 75 | print('Fitting order {} fourier descriptors'.format(order)) 76 | stop_position = 3 + (order * 8) 77 | grid.fit(train_fourier_descriptors[::2, :stop_position], train_labels[::2]) 78 | print("The best parameters for order {} are {} with a score of {}\n".format( 79 | order, grid.best_params_, grid.best_score_)) 80 | if grid.best_score_ > best_score: 81 | best_score = grid.best_score_ 82 | best_order = order 83 | best_params = grid.best_params_ 84 | 85 | print('Training model on order {} with best parameters {}'.format( 86 | best_order, best_params)) 87 | stop_position = 3 + (best_order * 8) 88 | clf = SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma']) 89 | clf.fit(X=train_fourier_descriptors[:, :stop_position], y=train_labels) 90 | 91 | # Run predictions on unseen test data to verify generalization 92 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 93 | if not path.exists(): 94 | print("Retrieving test data from web...") 95 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 96 | 97 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 98 | test_fourier_descriptors = test_loaded['elliptic_fourier_descriptors'] 99 | test_labels = test_loaded['above_or_below_median'][:, 0] 100 | test_labels = np.reshape(test_labels, (test_labels.shape[0])) 101 | test_fourier_descriptors = scaler.transform(test_fourier_descriptors) 102 | 103 | print('Run on test data...') 104 | predictions = clf.predict(test_fourier_descriptors[:, :stop_position]) 105 | test_accuracy = accuracy_score(test_labels, predictions) 106 | 107 | runtime = time() - SCRIPT_START 108 | message = '\nTest accuracy of {} for fourier descriptor order {} with {} in {}'.format( 109 | test_accuracy, best_order, best_params, timedelta(seconds=runtime)) 110 | print(message) 111 | notify(SCRIPT_NAME, message) 112 | -------------------------------------------------------------------------------- /model/building_convnet_fixed.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the building type, based solely on the geometry for that building. 3 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 
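Hyperparameters are read from environment variables (see the hp dict below); an external driver such as model/grid_search.py can use this to sweep configurations without editing the script.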
4 | """ 5 | 6 | import os 7 | import socket 8 | import sys 9 | from datetime import datetime, timedelta 10 | from pathlib import Path 11 | from time import time 12 | from urllib.request import urlretrieve 13 | 14 | import numpy as np 15 | from keras import Input 16 | from keras.callbacks import TensorBoard 17 | from keras.engine import Model 18 | from keras.layers import Dense, Conv1D, GlobalAveragePooling1D, Dropout 19 | from keras.optimizers import Adam 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.model_selection import train_test_split 22 | 23 | from topoml_util import geom_scaler 24 | from topoml_util.slack_send import notify 25 | 26 | SCRIPT_VERSION = '2.0.3' 27 | SCRIPT_NAME = os.path.basename(__file__) 28 | TIMESTAMP = str(datetime.now()).replace(':', '.') 29 | SIGNATURE = SCRIPT_NAME + ' ' + SCRIPT_VERSION + ' ' + TIMESTAMP 30 | DATA_FOLDER = '../files/buildings/' 31 | TRAIN_DATA_FILE = 'buildings_train_v7.npz' 32 | TEST_DATA_FILE = 'buildings_test_v7.npz' 33 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11381' 34 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11380' 35 | SCRIPT_START = time() 36 | 37 | # Hyperparameters 38 | hp = { 39 | 'BATCH_SIZE': int(os.getenv('BATCH_SIZE', 32)), 40 | 'TRAIN_VALIDATE_SPLIT': float(os.getenv('TRAIN_VALIDATE_SPLIT', 0.1)), 41 | 'REPEAT_DEEP_ARCH': int(os.getenv('REPEAT_DEEP_ARCH', 0)), 42 | 'DENSE_SIZE': int(os.getenv('DENSE_SIZE', 32)), 43 | 'EPOCHS': int(os.getenv('EPOCHS', 200)), 44 | 'LEARNING_RATE': float(os.getenv('LEARNING_RATE', 1e-4)), 45 | 'DROPOUT': float(os.getenv('DROPOUT', 0.0)), 46 | 'GEOM_SCALE': float(os.getenv("GEOM_SCALE", 0)), # If no default or 0: overridden when data is known 47 | } 48 | OPTIMIZER = Adam(lr=hp['LEARNING_RATE']) 49 | 50 | # Load training data 51 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 52 | if not path.exists(): 53 | print("Retrieving training data from web...") 54 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 55 | 56 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 57 | train_geoms = train_loaded['fixed_size_geoms'] 58 | train_labels = train_loaded['building_type'] 59 | 60 | # Determine final test mode or standard 61 | if len(sys.argv) > 1 and sys.argv[1] in ['-t', '--test']: 62 | print('Training in final test mode') 63 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 64 | if not path.exists(): 65 | print("Retrieving test data from web...") 66 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 67 | 68 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 69 | test_geoms = test_loaded['fixed_size_geoms'] 70 | test_labels = test_loaded['building_type'] 71 | else: 72 | print('Training in standard training mode') 73 | # Split the training data in random seen/unseen sets 74 | train_geoms, test_geoms, train_labels, test_labels = train_test_split(train_geoms, train_labels, test_size=0.1) 75 | 76 | # Normalize 77 | geom_scale = hp['GEOM_SCALE'] or geom_scaler.scale(train_geoms) 78 | train_geoms = geom_scaler.transform(train_geoms, geom_scale) 79 | test_geoms = geom_scaler.transform(test_geoms, geom_scale) # re-use variance from training 80 | 81 | # Map types to one-hot vectors 82 | # noinspection PyUnresolvedReferences 83 | train_targets = np.zeros((len(train_labels), train_labels.max() + 1)) 84 | for index, building_type in enumerate(train_labels): 85 | train_targets[index, building_type] = 1 86 | 87 | # Shape determination 88 | geom_max_points, geom_vector_len = train_geoms.shape[1:] 89 | output_size = train_targets.shape[-1] 90 | 91 | 
# Build model 92 | inputs = Input(shape=(geom_max_points, geom_vector_len)) 93 | model = Conv1D(filters=32, kernel_size=(5,), activation='relu')(inputs) 94 | model = Conv1D(filters=48, kernel_size=(5,), activation='relu', strides=2)(model) 95 | model = Conv1D(filters=64, kernel_size=(5,), activation='relu', strides=2)(model) 96 | model = GlobalAveragePooling1D()(model) 97 | model = Dense(hp['DENSE_SIZE'], activation='relu')(model) 98 | model = Dropout(hp['DROPOUT'])(model) 99 | model = Dense(output_size, activation='softmax')(model) 100 | 101 | model = Model(inputs=inputs, outputs=model) 102 | model.compile( 103 | loss='categorical_crossentropy', 104 | metrics=['accuracy'], 105 | optimizer=OPTIMIZER), 106 | model.summary() 107 | 108 | # Callbacks 109 | callbacks = [TensorBoard(log_dir='./tensorboard_log/' + SIGNATURE, write_graph=False)] 110 | 111 | history = model.fit( 112 | x=train_geoms, 113 | y=train_targets, 114 | epochs=hp['EPOCHS'], 115 | batch_size=hp['BATCH_SIZE'], 116 | validation_split=hp['TRAIN_VALIDATE_SPLIT'], 117 | callbacks=callbacks).history 118 | 119 | # Run on unseen test data 120 | test_pred = [np.argmax(prediction) for prediction in model.predict(test_geoms)] 121 | accuracy = accuracy_score(test_labels, test_pred) 122 | 123 | runtime = time() - SCRIPT_START 124 | message = 'on {} completed with accuracy of \n{:f} \nin {} in {} epochs\n'.format( 125 | socket.gethostname(), accuracy, timedelta(seconds=runtime), len(history['val_loss'])) 126 | 127 | for key, value in sorted(hp.items()): 128 | message += '{}: {}\t'.format(key, value) 129 | 130 | notify(SIGNATURE, message) 131 | print(SCRIPT_NAME, 'finished successfully with', message) 132 | -------------------------------------------------------------------------------- /model/configs/README.md: -------------------------------------------------------------------------------- 1 | # Configurations 2 | This directory contains an archive of python test setup configurations. 
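The archived configurations are applied by exporting each hyperparameter as an environment variable before launching a model script, as `grid_search.py` does. A minimal sketch of that pattern (the configuration values below are illustrative, not taken from the archive):

```python
import os

# Hypothetical configuration; the real values live in the archived files in this directory.
configuration = {'BATCH_SIZE': 256, 'LEARNING_RATE': 1e-3, 'EPOCHS': 200}

for key, value in configuration.items():
    os.environ[key] = str(value)  # read back via os.getenv(...) in the model scripts

exit_code = os.system('python3 neighborhood_convnet_fixed.py')
```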
-------------------------------------------------------------------------------- /model/grid_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | 4 | import sys 5 | 6 | # import numpy as np 7 | from sklearn.model_selection import ParameterGrid 8 | from topoml_util.slack_send import notify 9 | 10 | SCRIPT_NAME = os.path.basename(__file__) 11 | SCRIPT_VERSION = '1.0.2' 12 | SIGNATURE = '{} {} on {}'.format(SCRIPT_NAME, SCRIPT_VERSION, socket.gethostname()) 13 | N_TIMES = 1 14 | 15 | if len(sys.argv) > 1: 16 | script_name = sys.argv[1] 17 | else: # resort to default, for 18 | # script_name = 'neighborhood_convnet.py' 19 | script_name = 'neighborhood_lstm.py' 20 | # script_name = 'building_convnet.py' 21 | # script_name = 'building_lstm.py' 22 | # script_name = 'archaeology_convnet.py' 23 | # script_name = 'archaeology_lstm.py' 24 | 25 | HYPERPARAMS = { 26 | 'BATCH_SIZE': [256], 27 | # 'REPEAT_DEEP_ARCH': [1, 2], 28 | # 'KERNEL_SIZE': np.linspace(1, 8, 8, dtype=int), 29 | # 'LSTM_SIZE': np.linspace(64, 128, 3, dtype=int), 30 | # 'DENSE_SIZE': [64], 31 | # 'EPOCHS': [200], 32 | # 'LEARNING_RATE': [8e-4, 6e-4, 4e-4, 2e-4, 1e-4], 33 | 'LEARNING_RATE': [5e-3, 1e-3], 34 | # 'LEARNING_RATE': [8e-5, 6e-5], 35 | # 'GEOM_SCALE': [1e0, 1e-1, 1e-2, 1e-3], 36 | # 'RECURRENT_DROPOUT': [0.0], 37 | # 'PATIENCE': [0, 1, 4, 8, 16, 32], 38 | # 'EARLY_STOPPING': [0], 39 | } 40 | grid = list(ParameterGrid(HYPERPARAMS)) 41 | 42 | for configuration in grid: 43 | envs = [] 44 | # Set environment variables (this allows you to do hyperparam searches from any scripting environment) 45 | for key, value in configuration.items(): 46 | os.environ[key] = str(value) 47 | 48 | # repeat to get a sense of results spread 49 | for _ in range(N_TIMES): 50 | r_code = os.system('python3 ' + script_name) 51 | if not r_code == 0: 52 | print('Grid search exited with error') 53 | notify(SIGNATURE, 'error') 54 | sys.exit(1) 55 | 56 | notify(SIGNATURE, 'success') 57 | print('Grid search {} finished successfully'.format(SIGNATURE)) 58 | -------------------------------------------------------------------------------- /model/neighborhood_convnet_fixed.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script executes the task of estimating the number of inhabitants of a neighborhood to be under or over the 3 | median of all neighborhoods in the Netherlands, based solely on the geometry for that neighborhood. 4 | The data for this script can be found at http://hdl.handle.net/10411/GYPPBR. 
5 | """ 6 | 7 | import os 8 | import socket 9 | import sys 10 | from datetime import datetime, timedelta 11 | from pathlib import Path 12 | from time import time 13 | from urllib.request import urlretrieve 14 | 15 | import numpy as np 16 | from keras import Input 17 | from keras.callbacks import TensorBoard 18 | from keras.engine import Model 19 | from keras.layers import Dense, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dropout 20 | from keras.optimizers import Adam 21 | from sklearn.metrics import accuracy_score 22 | from sklearn.model_selection import train_test_split 23 | 24 | from topoml_util import geom_scaler 25 | from topoml_util.slack_send import notify 26 | 27 | SCRIPT_VERSION = '2.0.5' 28 | SCRIPT_NAME = os.path.basename(__file__) 29 | TIMESTAMP = str(datetime.now()).replace(':', '.') 30 | SIGNATURE = SCRIPT_NAME + ' ' + SCRIPT_VERSION + ' ' + TIMESTAMP 31 | DATA_FOLDER = '../files/neighborhoods/' 32 | TRAIN_DATA_FILE = 'neighborhoods_train_v7.npz' 33 | TEST_DATA_FILE = 'neighborhoods_test_v7.npz' 34 | TRAIN_DATA_URL = 'https://dataverse.nl/api/access/datafile/11378' 35 | TEST_DATA_URL = 'https://dataverse.nl/api/access/datafile/11379' 36 | SCRIPT_START = time() 37 | 38 | # Hyperparameters 39 | hp = { 40 | 'BATCH_SIZE': int(os.getenv('BATCH_SIZE', 32)), 41 | 'TRAIN_VALIDATE_SPLIT': float(os.getenv('TRAIN_VALIDATE_SPLIT', 0.1)), 42 | 'REPEAT_DEEP_ARCH': int(os.getenv('REPEAT_DEEP_ARCH', 0)), 43 | 'DENSE_SIZE': int(os.getenv('DENSE_SIZE', 32)), 44 | 'EPOCHS': int(os.getenv('EPOCHS', 200)), 45 | 'LEARNING_RATE': float(os.getenv('LEARNING_RATE', 1e-3)), 46 | 'DROPOUT': float(os.getenv('DROPOUT', 0.0)), 47 | 'GEOM_SCALE': float(os.getenv("GEOM_SCALE", 0)), # If no default or 0: overridden when data is known 48 | } 49 | OPTIMIZER = Adam(lr=hp['LEARNING_RATE']) 50 | 51 | # Load training data 52 | path = Path(DATA_FOLDER + TRAIN_DATA_FILE) 53 | if not path.exists(): 54 | print("Retrieving training data from web...") 55 | urlretrieve(TRAIN_DATA_URL, DATA_FOLDER + TRAIN_DATA_FILE) 56 | 57 | train_loaded = np.load(DATA_FOLDER + TRAIN_DATA_FILE) 58 | train_geoms = train_loaded['fixed_size_geoms'] 59 | train_labels = train_loaded['above_or_below_median'] 60 | 61 | # Determine final test mode or standard 62 | if len(sys.argv) > 1 and sys.argv[1] in ['-t', '--test']: 63 | print('Training in final test mode') 64 | path = Path(DATA_FOLDER + TEST_DATA_FILE) 65 | if not path.exists(): 66 | print("Retrieving test data from web...") 67 | urlretrieve(TEST_DATA_URL, DATA_FOLDER + TEST_DATA_FILE) 68 | 69 | test_loaded = np.load(DATA_FOLDER + TEST_DATA_FILE) 70 | test_geoms = test_loaded['fixed_size_geoms'] 71 | test_labels = test_loaded['above_or_below_median'] 72 | else: 73 | print('Training in standard training mode') 74 | # Split the training data in random seen/unseen sets 75 | train_geoms, test_geoms, train_labels, test_labels = train_test_split(train_geoms, train_labels, test_size=0.1) 76 | 77 | # Normalize 78 | geom_scale = hp['GEOM_SCALE'] or geom_scaler.scale(train_geoms) 79 | train_geoms = geom_scaler.transform(train_geoms, geom_scale) 80 | test_geoms = geom_scaler.transform(test_geoms, geom_scale) # re-use variance from training 81 | 82 | # Map types to one-hot vectors 83 | # noinspection PyUnresolvedReferences 84 | train_targets = np.zeros((len(train_labels), train_labels.max() + 1)) 85 | for index, train_label in enumerate(train_labels): 86 | train_targets[index, train_label] = 1 87 | 88 | # Shape determination 89 | geom_vector_len = train_geoms[0].shape[1] 90 | 91 | # Build model 92 | 
inputs = Input(shape=(None, geom_vector_len)) 93 | model = Conv1D(32, (5,), activation='relu')(inputs) 94 | model = MaxPooling1D(3, padding='SAME')(model) 95 | model = Conv1D(64, (5,), activation='relu')(model) 96 | model = GlobalAveragePooling1D()(model) 97 | model = Dense(hp['DENSE_SIZE'], activation='relu')(model) 98 | model = Dropout(hp['DROPOUT'])(model) 99 | model = Dense(2, activation='softmax')(model) 100 | 101 | model = Model(inputs=inputs, outputs=model) 102 | model.compile( 103 | loss='categorical_crossentropy', 104 | metrics=['accuracy'], 105 | optimizer=OPTIMIZER), 106 | model.summary() 107 | 108 | # Callbacks 109 | callbacks = [TensorBoard(log_dir='./tensorboard_log/' + SIGNATURE, write_graph=False)] 110 | 111 | history = model.fit( 112 | x=train_geoms, 113 | y=train_targets, 114 | epochs=hp['EPOCHS'], 115 | batch_size=hp['BATCH_SIZE'], 116 | validation_split=hp['TRAIN_VALIDATE_SPLIT'], 117 | callbacks=callbacks).history 118 | 119 | # Run on unseen test data 120 | test_pred = [np.argmax(prediction) for prediction in model.predict(test_geoms)] 121 | accuracy = accuracy_score(test_labels, test_pred) 122 | 123 | runtime = time() - SCRIPT_START 124 | message = 'on {} completed with accuracy of \n{:f} \nin {} in {} epochs\n'.format( 125 | socket.gethostname(), accuracy, timedelta(seconds=runtime), len(history['val_loss'])) 126 | 127 | for key, value in sorted(hp.items()): 128 | message += '{}: {}\t'.format(key, value) 129 | 130 | notify(SIGNATURE, message) 131 | print(SCRIPT_NAME, 'finished successfully with', message) 132 | -------------------------------------------------------------------------------- /model/plots/README.md: -------------------------------------------------------------------------------- 1 | # Plots 2 | This directory is a logging directory for png-saved pyplots. 
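The images are written as timestamped .png files by the `DecypherAll` callback in `topoml_util/PyplotLogger.py`. A minimal, hypothetical sketch of attaching it to a run (model and data names are placeholders):

```python
from topoml_util.PyplotLogger import DecypherAll

callbacks = [DecypherAll(gmm_size=1, sample_size=3, plot_dir='plots')]
# The callback samples from the validation data, so a validation split is required:
# model.fit(train_vectors, train_targets, validation_split=0.1, callbacks=callbacks)
```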
-------------------------------------------------------------------------------- /model/topoml_util/ConsoleLogger.py: -------------------------------------------------------------------------------- 1 | from keras.callbacks import Callback 2 | import random 3 | from datetime import datetime 4 | import numpy as np 5 | 6 | 7 | class DecypherAll(Callback): 8 | def __init__(self, decypher): 9 | super().__init__() 10 | self.decypher = decypher 11 | 12 | def on_epoch_end(self, epoch, logs=None): 13 | random.seed(datetime.now()) 14 | sample_indexes = random.sample(range(len(self.validation_data[0])), 3) 15 | input_samples = [self.validation_data[0][sample] for sample in sample_indexes] 16 | target_samples = [self.validation_data[1][sample] for sample in sample_indexes] 17 | predictions = self.model.predict(np.array(input_samples)) 18 | 19 | print('') 20 | 21 | for (input, target, prediction) in zip(input_samples, target_samples, predictions): 22 | print('Input: %s' % self.decypher(input)) 23 | print('Target: %s' % self.decypher(target)) 24 | print('Prediction: %s\n' % self.decypher(prediction)) 25 | 26 | -------------------------------------------------------------------------------- /model/topoml_util/GaussianMixtureLoss.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | 3 | from topoml_util.GeoVectorizer import RENDER_LEN, GEOM_TYPE_LEN, ONE_HOT_LEN 4 | from topoml_util.gaussian_loss import bivariate_gaussian, univariate_gaussian 5 | 6 | 7 | class GaussianMixtureLoss: 8 | def __init__(self, num_components, num_points): 9 | self.num_points = num_points 10 | self.num_components = num_components 11 | 12 | def geom_gaussian_mixture_loss(self, y_true, y_pred): 13 | """ 14 | Calculates a loss from a rank 3 sequence, representing a self.num_components * 6 slice (the mixture components) 15 | plus one-hot encoded sequences of geometry type (8) and render/stop action type (3) 16 | :param y_true: rank 3 of shape(records, points, true_point_features >= 17) truth values tensor 17 | :param y_pred: rank 3 of shape(records, points, pred_point_features >= 17) predicted values tensor 18 | :return: a summed mixture loss and categorical cross entropy losses for the geometry type and stop bits 19 | """ 20 | # loss fn based on eq #26 of http://arxiv.org/abs/1308.0850. 
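        # Editorial note on the feature layout assumed here (see the docstring above):
        # each point carries num_components * 6 mixture parameters, followed by an 8-wide
        # one-hot geometry type and a 3-wide one-hot render/stop action, so e.g.
        # 1 component -> 6 + 8 + 3 = 17 features and 5 components -> 30 + 8 + 3 = 41.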
21 | # Reshape to one target component to be broadcasted over self.num_components 22 | true_coordinates = y_true[..., :2] 23 | # It would be nice to be able to do 24 | # shape = [*y_true.shape[:-1], 1, 2] 25 | shape = [-1, self.num_points, 1, 2] 26 | true_coordinates = K.reshape(true_coordinates, tuple(shape)) 27 | 28 | y_pred_gmm_components = y_pred[..., :-ONE_HOT_LEN] 29 | predicted_components = K.reshape( 30 | y_pred_gmm_components, 31 | # (*y_pred.shape[:-1], -1, 6)) # This would be nice 32 | (-1, self.num_points, self.num_components, 6)) 33 | 34 | pi_index = 5 # mixture component weight 35 | pi_weights = K.softmax(predicted_components[..., pi_index]) 36 | gmm = bivariate_gaussian(true_coordinates, predicted_components) * pi_weights 37 | gmm_loss = K.sum(-K.log(gmm + K.epsilon())) 38 | 39 | render_action = K.softmax(y_true[..., -RENDER_LEN:]) 40 | neg_full_stop_chance = 1 - render_action[..., 2] # 1 minus the chance of full stop 41 | gmm_loss = gmm_loss * neg_full_stop_chance 42 | 43 | geom_type_error = K.categorical_crossentropy( 44 | K.softmax(y_true[..., -(GEOM_TYPE_LEN + RENDER_LEN - 1):-RENDER_LEN]), 45 | K.softmax(y_pred[..., -(GEOM_TYPE_LEN + RENDER_LEN - 1):-RENDER_LEN])) 46 | render_error = K.categorical_crossentropy( 47 | K.softmax(y_true[..., -RENDER_LEN:]), 48 | K.softmax(y_pred[..., -RENDER_LEN:])) 49 | 50 | return gmm_loss + geom_type_error + render_error 51 | 52 | def univariate_gmm_loss(self, true, pred): 53 | """ 54 | A simple loss function for rank 3 single gaussian mixture models 55 | :param true: truth values tensor 56 | :param pred: prediction values tensor 57 | :return: loss values tensor 58 | """ 59 | if not true.shape == pred.shape: 60 | print( 61 | 'Warning: truth', true.shape, 'and prediction tensors', pred.shape, 'do not have the same shape. 
The ' 62 | 'outcome of the loss function may be unpredictable.') 63 | 64 | # true_components = K.reshape(true, (-1, self.num_components, 3)) 65 | # TODO: make reshape op rank agnostic 66 | predicted_components = K.reshape(pred, (-1, self.num_components, 3)) 67 | 68 | pi_index = 2 69 | pi_weights = K.softmax(pred[..., pi_index]) 70 | gmm = univariate_gaussian(true, predicted_components) * pi_weights 71 | gmm_loss = -K.log(K.sum(gmm + K.epsilon())) 72 | 73 | return gmm_loss 74 | -------------------------------------------------------------------------------- /model/topoml_util/LoggerCallback.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | from keras.callbacks import Callback 3 | import random 4 | from datetime import datetime 5 | import numpy as np 6 | 7 | pp = pprint.PrettyPrinter() 8 | 9 | 10 | class EpochLogger(Callback): 11 | def __init__(self, input_func=None, target_func=None, predict_func=None, aggregate_func=None, sample_size=3, 12 | stdout=False, input_slice=lambda x: x[0:1], target_slice=lambda x: x[1:2]): 13 | super().__init__() 14 | self.input_func = input_func 15 | self.target_func = target_func 16 | self.predict_func = predict_func 17 | self.aggregate_func = aggregate_func 18 | self.sample_size = sample_size 19 | self.log_to_stdout = stdout 20 | self.input_slice = input_slice 21 | self.target_slice = target_slice 22 | 23 | def on_epoch_end(self, epoch, logs=None): 24 | random.seed(datetime.now()) 25 | sample_indexes = random.sample(range(len(self.validation_data[0])), self.sample_size) 26 | inputs = np.array(self.input_slice(self.validation_data)) 27 | targets = np.array(self.target_slice(self.validation_data)) 28 | input_samples = [inputs[:, sample_index] for sample_index in sample_indexes] 29 | target_samples = [targets[:, sample_index] for sample_index in sample_indexes] 30 | 31 | predictions = [] 32 | for sample_index in sample_indexes: 33 | sample = inputs[:, sample_index:sample_index + 1] 34 | predictions.append(self.model.predict([*sample])) 35 | 36 | print('\nLogging output for %i inputs, targets and predictions...' 
% len(predictions)) 37 | 38 | for (inputs, targets, predictions) in zip(input_samples, target_samples, predictions): 39 | 40 | if self.log_to_stdout: 41 | print('Input:') 42 | pp.pprint(inputs) 43 | print('Target:') 44 | pp.pprint(targets) 45 | print('Prediction:') 46 | pp.pprint(predictions) 47 | print('') 48 | 49 | if self.aggregate_func: 50 | self.aggregate_func( 51 | (self.input_func(inputs), self.target_func(targets), self.predict_func(predictions))) 52 | -------------------------------------------------------------------------------- /model/topoml_util/PyplotLogger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | import random 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | from keras.callbacks import Callback 8 | from shapely.geometry import Point 9 | 10 | from .GeoVectorizer import GeoVectorizer 11 | from .wkt2pyplot import save_plot 12 | 13 | pp = pprint.PrettyPrinter() 14 | 15 | 16 | class DecypherAll(Callback): 17 | def __init__(self, gmm_size=1, sample_size=3, input_slice=lambda x: x[0:1], target_slice=lambda x: x[1:2], 18 | stdout=False, save_plots=True, plot_dir='plots'): 19 | """ 20 | Class constructor that instantiates with a few vital settings in order to decypher the output 21 | :type target_slice: object 22 | :param gmm_size: size as an integer of the gaussian mixture model 23 | :param sample_size: size as an integer of the number of samples to log 24 | :param stdout: boolean whether or not to log to stdout. Mixture models can have a lot of output. 25 | :param plot_dir: string of a directory to save plots to, relative to the path called to execute the script 26 | """ 27 | super().__init__() 28 | self.gmm_size = gmm_size 29 | self.sample_size = sample_size 30 | self.input_slice = input_slice 31 | self.target_slice = target_slice 32 | self.stdout = stdout 33 | self.save_plots = save_plots 34 | 35 | os.makedirs(plot_dir, exist_ok=True) 36 | self.plot_dir = plot_dir 37 | 38 | def on_epoch_end(self, epoch, logs=None): 39 | """ 40 | Epochal logging function that outputs to a pyplot saved to a timestamped .png file 41 | :param epoch: automatically instantiated by Keras 42 | :param logs: automatically instantiated by Keras 43 | """ 44 | random.seed(datetime.now()) 45 | 46 | sample_indexes = random.sample(range(len(self.validation_data[0])), self.sample_size) 47 | inputs = np.array(self.input_slice(self.validation_data)) 48 | targets = np.array(self.target_slice(self.validation_data)) 49 | input_samples = [inputs[:, sample_index] for sample_index in sample_indexes] 50 | target_samples = [targets[:, sample_index] for sample_index in sample_indexes] 51 | 52 | predictions = [] 53 | for sample_index in sample_indexes: 54 | sample = inputs[:, sample_index:sample_index + 1] 55 | predictions.append(self.model.predict([*sample])) 56 | 57 | print('\nPlotting output for %i inputs, targets and predictions...' 
% len(predictions)) 58 | 59 | for (input_vectors, target_vectors, prediction_vectors) in zip(input_samples, target_samples, predictions): 60 | timestamp = str(datetime.now()).replace(':', '.') 61 | 62 | if self.stdout: 63 | print('Input:') 64 | pp.pprint(input_vectors) 65 | print('Target:') 66 | pp.pprint(target_vectors) 67 | print('Prediction:') 68 | pp.pprint(prediction_vectors) 69 | 70 | if self.save_plots: 71 | input_polys = [GeoVectorizer.decypher(poly) for poly in input_vectors] 72 | target_polys = [GeoVectorizer.decypher(target_vectors[0])] 73 | prediction_points = [ 74 | Point(point).wkt for point in 75 | GeoVectorizer(gmm_size=self.gmm_size).decypher_gmm_geom(prediction_vectors[0], 500) 76 | ] 77 | 78 | geoms = input_polys, target_polys, prediction_points 79 | save_plot(geoms, self.plot_dir, timestamp) 80 | -------------------------------------------------------------------------------- /model/topoml_util/Tokenizer.py: -------------------------------------------------------------------------------- 1 | from keras.preprocessing.text import Tokenizer 2 | import numpy as np 3 | 4 | 5 | class Tokenize(Tokenizer): 6 | """Text tokenization wrapper around Keras text tokenization methods 7 | """ 8 | 9 | def __init__(self, texts): 10 | super().__init__(num_words=None, 11 | filters='\t\n', 12 | lower=True, 13 | split="", 14 | char_level=True) 15 | self.fit_on_texts(texts) 16 | 17 | @staticmethod 18 | def truncate(max_len, untruncated_training_set, untruncated_target_set): 19 | """ 20 | Method for truncating the training and target set to fit the maximum 21 | sequence length, batch and validation set size 22 | :param max_len: maximum length of characters per sequence/sentence 23 | :param untruncated_training_set: untruncated list of input sequences 24 | :param untruncated_target_set: untruncated list of target output sequences 25 | :return: training_set, target_set: a tuple of truncated training and target sets 26 | """ 27 | training_set = [] 28 | target_set = [] 29 | 30 | # Restrict input to be of less or equal length as the maximum length. 31 | for index, record in enumerate(untruncated_training_set): 32 | if len(record) <= max_len: 33 | training_set.append(record) 34 | target_set.append(untruncated_target_set[index]) 35 | 36 | return training_set, target_set 37 | 38 | @staticmethod 39 | def batch_truncate(batch_size, max_len, validation_split, untruncated_training_set, untruncated_target_set): 40 | """ 41 | Method for truncating the training and target set to fit the maximum 42 | sequence length, batch and validation set size 43 | :param batch_size: size of the epoch batch size 44 | :param max_len: maximum length of characters per sequence/sentence 45 | :param validation_split: ratio of the training/validation split 46 | :param untruncated_training_set: untruncated list of input sequences 47 | :param untruncated_target_set: untruncated list of target output sequences 48 | :return: training_set, target_set: a tuple of truncated training and target sets 49 | """ 50 | training_set = [] 51 | target_set = [] 52 | 53 | # Restrict input to be of less or equal length as the maximum length. 
54 | for index, record in enumerate(untruncated_training_set): 55 | if len(record) <= max_len: 56 | training_set.append(record) 57 | target_set.append(untruncated_target_set[index]) 58 | 59 | # Truncate the array to the batch size, accounting for the validation set 60 | # The validation sample size must be a multiple of the batch size 61 | # Say the truncated length is 27,000 and the split ratio is 0.1, the validation sample size is 2700 62 | validation_size = int(len(training_set) * validation_split) 63 | # We need to get it down to 2000 64 | validation_size = validation_size - validation_size % batch_size 65 | # The truncated length must be a multiple of the validation sample size 66 | truncated_size = len(training_set) - len(training_set) % int(validation_size / validation_split) 67 | training_set = training_set[0:truncated_size] 68 | target_set = target_set[0:truncated_size] 69 | return training_set, target_set 70 | 71 | @staticmethod 72 | def max_sample(predictions): 73 | # helper function to sample an index from a probability array 74 | return np.argmax(predictions) 75 | 76 | def char_level_tokenize(self, texts): 77 | sequences = self.texts_to_sequences(texts) 78 | return sequences 79 | 80 | def decypher(self, sequences): 81 | """ 82 | Decyphers a encoded 3D array of one-hot vectors back to a 2D array of sentences 83 | :param sequences: 84 | :return: 85 | """ 86 | # sampled = [Tokenize.max_sample(token) for token in prediction] 87 | # sequence.append(sampled) 88 | inv_cipher = {v: k for k, v in self.word_index.items()} 89 | decyphered = [] 90 | for sequence in sequences: 91 | decyphered_sequence = [] 92 | for num in sequence: 93 | if num in inv_cipher: 94 | decyphered_sequence.append(inv_cipher[num]) 95 | else: 96 | decyphered_sequence.append(' ') 97 | decyphered.append(''.join([char for char in decyphered_sequence])) 98 | return decyphered 99 | 100 | def one_hot(self, input_sequences, maxlen): 101 | # The third dimension of the matrix is equal to the length of the word index plus one: 102 | # There is no '0' index in the word index. 103 | x = np.zeros((len(input_sequences), maxlen, len(self.word_index) + 1), dtype=np.bool) 104 | for i, sentence in enumerate(input_sequences): 105 | for t, char in enumerate(sentence): 106 | x[i, t, self.word_index[char]] = True 107 | return x 108 | 109 | -------------------------------------------------------------------------------- /model/topoml_util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/model/topoml_util/__init__.py -------------------------------------------------------------------------------- /model/topoml_util/gaussian_loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras import backend as K 3 | from keras.backend import epsilon 4 | from keras.losses import mse, categorical_crossentropy 5 | 6 | from .GeoVectorizer import GEOM_TYPE_INDEX, RENDER_INDEX 7 | 8 | 9 | def geom_gaussian_loss(y_true, y_pred): 10 | # loss fn based on eq #26 of http://arxiv.org/abs/1308.0850. 
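    # Editorial note: the returned loss is the sum of three terms computed below: a bivariate
    # gaussian negative log likelihood over the coordinate slice, plus categorical
    # cross-entropies over the one-hot geometry-type and render/stop slices, whose offsets
    # are marked by GEOM_TYPE_INDEX and RENDER_INDEX.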
11 | gaussian_loss = bivariate_gaussian_loss(y_true, y_pred) 12 | geom_type_error = categorical_crossentropy(K.softmax(y_true[..., GEOM_TYPE_INDEX:RENDER_INDEX]), 13 | K.softmax(y_pred[..., GEOM_TYPE_INDEX:RENDER_INDEX])) 14 | render_error = categorical_crossentropy(K.softmax(y_true[..., RENDER_INDEX:]), 15 | K.softmax(y_pred[..., RENDER_INDEX:])) 16 | return gaussian_loss + geom_type_error + render_error 17 | 18 | 19 | # Adapted to Keras from https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L268 20 | # Adapted version of the probability density function of 21 | # https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case 22 | def bivariate_gaussian(true, pred): 23 | """ 24 | Stabilized rank-agnostic bivariate gaussian probability function (pdf) 25 | Returns results of eq # 24 of http://arxiv.org/abs/1308.0850 26 | :param true: truth values with at least [mu1, mu2] 27 | :param pred: values predicted with at least [mu1, mu2, sigma1, sigma2, rho] 28 | :return: probability density function 29 | """ 30 | x_coord = true[..., 0] 31 | y_coord = true[..., 1] 32 | mu_x = pred[..., 0] 33 | mu_y = pred[..., 1] 34 | # exponentiate the sigmas and also make correlative rho between -1 and 1. 35 | # eq. # 21 and 22 of http://arxiv.org/abs/1308.0850 36 | # analogous to https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L326 37 | sigma_x = K.exp(K.abs(pred[..., 2])) + epsilon() 38 | sigma_y = K.exp(K.abs(pred[..., 3])) + epsilon() 39 | rho = K.tanh(pred[..., 4]) * 0 # avoid drifting to -1 or 1 to prevent NaN 40 | norm1 = K.log(1 + K.abs(x_coord - mu_x)) 41 | norm2 = K.log(1 + K.abs(y_coord - mu_y)) 42 | variance_x = K.square(sigma_x) 43 | variance_y = K.square(sigma_y) 44 | s1s2 = sigma_x * sigma_y # very large if sigma_x and/or sigma_y are very large 45 | # eq 25 of http://arxiv.org/abs/1308.0850 46 | z = ((K.square(norm1) / variance_x) + 47 | (K.square(norm2) / variance_y) - 48 | (2 * rho * norm1 * norm2 / s1s2)) # z → -∞ if rho * norm1 * norm2 → ∞ and/or s1s2 → 0 49 | neg_rho = 1 - K.square(rho) # → 0 if rho → {1, -1} 50 | numerator = K.exp(-z / (2 * neg_rho)) # → ∞ if z → -∞ and/or neg_rho → 0 51 | denominator = (2 * np.pi * s1s2 * K.sqrt(neg_rho)) # → 0 if s1s2 → 0 and/or neg_rho → 0 52 | pdf = numerator / denominator # → ∞ if denominator → 0 and/or if numerator → ∞ 53 | return pdf 54 | 55 | 56 | # Adapted to Keras from https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L268 57 | # Adapted version of the probability density function of 58 | # https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case 59 | # augmented to negative log likelihood loss configuration 60 | def bivariate_gaussian_loss(true, pred): 61 | """ 62 | Bivariate gaussian loss function 63 | Returns results of eq # 24 of http://arxiv.org/abs/1308.0850 64 | :param true: truth values with at least [mu1, mu2] 65 | :param pred: values predicted with at least [mu1, mu2, sigma1, sigma2, rho] 66 | :return: the log of the summed max likelihood 67 | """ 68 | pdf = bivariate_gaussian(true, pred) 69 | return K.sum(-K.log(pdf + epsilon())) # → -∞ if pdf → ∞ 70 | 71 | 72 | def univariate_gaussian(true, pred): 73 | """ 74 | Generic, rank-agnostic bivariate gaussian function 75 | Returns results of eq # 24 of http://arxiv.org/abs/1308.0850 76 | :param true: truth values with at least [mu] 77 | :param pred: values predicted with at least [mu, sigma] 78 | :return: probability density function 79 | """ 80 | x = 
true[..., 0] 81 | mu = pred[..., 0] 82 | sigma = pred[..., 1] 83 | 84 | norm = K.log(1 + K.abs(x - mu)) # needs log of norm to counter large mu diffs 85 | variance = K.softplus(K.square(sigma)) 86 | z = K.exp(-K.square(K.abs(norm)) / (2 * variance) + epsilon()) # z -> 0 if sigma 87 | # pdf -> 0 if sigma is very large or z -> 0; NaN if variance -> 0 88 | pdf = z / K.sqrt((2 * np.pi * variance) + epsilon()) 89 | return pdf 90 | 91 | 92 | def univariate_gaussian_loss(true, pred): 93 | pdf = univariate_gaussian(true, pred) # pdf -> 0 if sigma is very large or z -> 0 94 | return -K.log(pdf + epsilon()) # inf if pdf -> 0 95 | -------------------------------------------------------------------------------- /model/topoml_util/geom_fourier_descriptors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyefd import elliptic_fourier_descriptors 3 | 4 | np.seterr(all='raise') 5 | 6 | 7 | def geom_fourier_descriptors(shapes, order): 8 | """ 9 | Creates a stacked array of different variations of fourier descriptors: normalized, non-normalized 10 | :param shapes: a list of shapely shapes 11 | :param order: the fourier descriptor order (the size of the returned array along the 0-axis) 12 | :return: a 2d array with shape ((order * 2) + 3, 4) 13 | """ 14 | fourier_descriptors = [] 15 | for index, shape in enumerate(shapes): 16 | coeffs = create_geom_fourier_descriptor(shape, order) 17 | fourier_descriptors.append(coeffs) 18 | 19 | return fourier_descriptors 20 | 21 | 22 | def create_geom_fourier_descriptor(shape, order): 23 | boundary = shape.boundary 24 | while boundary.geom_type == "MultiLineString": 25 | boundary = boundary.geoms[0] 26 | # Set normalize to false to retain size information. 27 | non_normalized_coeffs = elliptic_fourier_descriptors( 28 | boundary.coords, order=order, normalize=False) 29 | # normalized Fouriers 30 | normalized_coeffs = elliptic_fourier_descriptors( 31 | boundary.coords, order=order, normalize=True) 32 | 33 | # TODO: create centroid distance fourier descriptors 34 | # See https://doi-org.vu-nl.idm.oclc.org/10.1016/j.image.2009.04.001 35 | # coords = np.array(boundary.coords) 36 | # centroid_distances = [boundary.centroid.distance(Point(point)) for point in coords] 37 | # centroid_fourier_descriptors = elliptic_fourier_descriptors(centroid_distances, normalize=True) 38 | 39 | # Stack 'em all 40 | coeffs = [shape.area, boundary.length, len(boundary.coords)] 41 | for nn, n in zip(non_normalized_coeffs, normalized_coeffs): 42 | coeffs = np.append(coeffs, nn) # without axis this will just create an array 43 | coeffs = np.append(coeffs, n) 44 | 45 | return coeffs 46 | -------------------------------------------------------------------------------- /model/topoml_util/geom_scaler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .GeoVectorizer import FULL_STOP_INDEX 3 | 4 | 5 | def scale(vectors): 6 | means = localized_mean(vectors) 7 | min_maxs = [] 8 | 9 | for index, data_point in enumerate(vectors): 10 | full_stop_point = data_point[:, FULL_STOP_INDEX].tolist() 11 | 12 | try: 13 | full_stop_point_index = full_stop_point.index(1) 14 | except Exception as e: # if a dummy point is encountered 15 | min_maxs.append([0, 0]) 16 | continue 17 | 18 | min_maxs.append([ 19 | np.min(data_point[..., :full_stop_point_index, :2] - means[index]), 20 | np.max(data_point[..., :full_stop_point_index, :2] - means[index]) 21 | ]) 22 | 23 | return np.std(min_maxs) 24 | 
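# Editorial usage sketch, mirroring the convnet scripts above: fit the scale on the training
# set only and re-use it for the test set, e.g.
#   scale_factor = scale(train_geoms)
#   train_geoms = transform(train_geoms, scale_factor)
#   test_geoms = transform(test_geoms, scale_factor)  # re-use variance from training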
25 | 26 | def transform(vectors, scale=None): 27 | localized = np.copy(vectors) 28 | means = localized_mean(vectors) 29 | 30 | for index, data_point in enumerate(localized): 31 | full_stop_point = data_point[:, FULL_STOP_INDEX].tolist() 32 | 33 | try: 34 | full_stop_point_index = full_stop_point.index(1) 35 | except Exception as e: # if a dummy point is encountered 36 | continue 37 | 38 | data_point[..., :full_stop_point_index + 1, :2] -= means[index] 39 | data_point[..., :full_stop_point_index + 1, :2] /= scale 40 | 41 | return localized 42 | 43 | 44 | def localized_mean(vectors): 45 | geom_means = [] 46 | for data_point in vectors: 47 | full_stop_point = data_point[:, FULL_STOP_INDEX].tolist() 48 | 49 | try: 50 | full_stop_point_index = full_stop_point.index(1) 51 | except Exception as e: # if a dummy point is encountered 52 | geom_means.append([[[0, 0]]]) 53 | continue 54 | 55 | # Take the mean of all non-null points for localized origin 56 | geom_mean = np.mean(data_point[0:full_stop_point_index, 0:2], axis=0, keepdims=True) 57 | geom_means.append(geom_mean) 58 | 59 | return np.array(geom_means) 60 | -------------------------------------------------------------------------------- /model/topoml_util/np_gaussian_2d_loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | epsilon = 1e-8 4 | 5 | 6 | def softplus(x): 7 | return np.logaddexp(1.0, x) 8 | 9 | 10 | def softmax(x): 11 | """Compute softmax values for each sets of scores in x.""" 12 | e_x = np.exp(x - np.max(x)) 13 | return e_x / e_x.sum() 14 | 15 | # Adapted version of the probability density function of 16 | # https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case 17 | # augmented to negative log likelihood loss configuration 18 | def np_r2_bivariate_gaussian_loss(true, pred): 19 | """Returns results of eq # 24 of http://arxiv.org/abs/1308.0850""" 20 | x_coord = true[:, 0] 21 | y_coord = true[:, 1] 22 | mu_x = pred[:, 0] 23 | mu_y = pred[:, 1] 24 | 25 | # exponentiate the sigmas and also make correlative rho between -1 and 1. 26 | # eq. 
# 21 and 22 of http://arxiv.org/abs/1308.0850 27 | # analogous to https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L326 28 | sigma_x = np.exp(np.abs(pred[:, 2])) 29 | sigma_y = np.exp(np.abs(pred[:, 3])) 30 | rho = np.tanh(pred[:, 4]) # hardcode to avoid drifting to -1 or 1 31 | 32 | norm1 = np.log(1 + np.abs(x_coord - mu_x)) 33 | norm2 = np.log(1 + np.abs(y_coord - mu_y)) 34 | 35 | variance_x = softplus(np.square(sigma_x)) 36 | variance_y = softplus(np.square(sigma_y)) 37 | s1s2 = softplus(sigma_x * sigma_y) # very large if sigma_x and/or sigma_y are very large 38 | 39 | # eq 25 of http://arxiv.org/abs/1308.0850 40 | z = ((np.square(norm1) / variance_x) + 41 | (np.square(norm2) / variance_y) - 42 | (2 * rho * norm1 * norm2 / s1s2)) # z → -∞ if rho * norm1 * norm2 → ∞ and/or s1s2 → 0 43 | neg_rho = 1 - np.square(rho) # → 0 if rho → {1, -1} 44 | numerator = np.exp(-z / (2 * neg_rho)) # → ∞ if z → -∞ and/or neg_rho → 0 45 | denominator = (2 * np.pi * s1s2 * np.sqrt(neg_rho)) + epsilon # → 0 if s1s2 → 0 and/or neg_rho → 0 46 | pdf = numerator / denominator # → ∞ if denominator → 0 and/or if numerator → ∞ 47 | return -np.log(pdf + epsilon) # → -∞ if pdf → ∞ 48 | 49 | 50 | # Adapted version of the probability density function of 51 | # https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case 52 | # augmented to negative log likelihood loss configuration 53 | def np_r4_bivariate_gaussian_loss(true, pred): 54 | pdf = np_r4_bivariate_gaussian(true, pred) 55 | return -np.log(pdf + epsilon) # → -∞ if pdf → ∞ 56 | 57 | 58 | def np_r4_bivariate_gaussian(true, pred): 59 | """Returns results of eq # 24 of http://arxiv.org/abs/1308.0850""" 60 | x_coord = true[:, :, :, 0] 61 | y_coord = true[:, :, :, 1] 62 | mu_x = pred[:, :, :, 0] 63 | mu_y = pred[:, :, :, 1] 64 | # exponentiate the sigmas and also make correlative rho between -1 and 1. 65 | # eq. 
# 21 and 22 of http://arxiv.org/abs/1308.0850 66 | # analogous to https://github.com/tensorflow/magenta/blob/master/magenta/models/sketch_rnn/model.py#L326 67 | sigma_x = np.exp(np.abs(pred[:, :, :, 2])) 68 | sigma_y = np.exp(np.abs(pred[:, :, :, 3])) 69 | rho = np.tanh(pred[:, :, :, 4]) * 0.1 # hardcode to avoid drifting to -1 or 1 70 | 71 | norm1 = np.log(1 + np.abs(x_coord - mu_x)) 72 | norm2 = np.log(1 + np.abs(y_coord - mu_y)) 73 | 74 | variance_x = softplus(np.square(sigma_x)) 75 | variance_y = softplus(np.square(sigma_y)) 76 | s1s2 = softplus(sigma_x * sigma_y) # very large if sigma_x and/or sigma_y are very large 77 | # eq 25 of http://arxiv.org/abs/1308.0850 78 | z = ((np.square(norm1) / variance_x) + 79 | (np.square(norm2) / variance_y) - 80 | (2 * rho * norm1 * norm2 / s1s2)) # z → -∞ if rho * norm1 * norm2 → ∞ and/or s1s2 → 0 81 | neg_rho = 1 - np.square(rho) # → 0 if rho → {1, -1} 82 | numerator = np.exp(-z / (2 * neg_rho)) # → ∞ if z → -∞ and/or neg_rho → 0 83 | denominator = (2 * np.pi * s1s2 * np.sqrt(neg_rho)) + epsilon # → 0 if s1s2 → 0 and/or neg_rho → 0 84 | pdf = numerator / denominator # → ∞ if denominator → 0 and/or if numerator → ∞ 85 | return pdf 86 | -------------------------------------------------------------------------------- /model/topoml_util/np_gmm_loss.py: -------------------------------------------------------------------------------- 1 | from topoml_util.gaussian_loss import r4_bivariate_gaussian 2 | import numpy as np 3 | 4 | class GaussianMixtureLoss: 5 | def __init__(self, num_components): 6 | self.num_components = num_components 7 | 8 | @staticmethod 9 | def softmax(x): 10 | """Compute softmax values for each sets of scores in x.""" 11 | return np.exp(x) / np.sum(np.exp(x), axis=0) 12 | 13 | @staticmethod 14 | def epsilon(): 15 | return 1e-16 16 | 17 | def geom_gaussian_mixture_loss(self, y_true, y_pred): 18 | # loss fn based on eq #26 of http://arxiv.org/abs/1308.0850. 19 | (data_points, points, features) = y_pred.shape 20 | geom_type_index = 6 * self.num_components # Calculate offset from parameters times components 21 | render_index = geom_type_index + 8 22 | pi_index = 5 23 | 24 | predicted_components = np.reshape(y_pred[:geom_type_index], (-1, points.value, self.num_components, 6)) 25 | pi = self.softmax(predicted_components[:, :, :, pi_index]) 26 | 27 | true_components = np.reshape(y_true[:geom_type_index], (-1, points.value, self.num_components, 6)) 28 | 29 | gmm = r4_bivariate_gaussian(true_components, predicted_components) * pi 30 | gmm_loss = np.sum(-np.log(gmm + self.epsilon())) 31 | 32 | # Zero out loss terms beyond N_s, the last actual stroke 33 | render = 1 - np.mean(y_pred[:, :, render_index:render_index + 2]) # RENDER and STOP values 34 | 35 | gmm_loss = gmm_loss * render 36 | 37 | return gmm_loss 38 | 39 | 40 | -------------------------------------------------------------------------------- /model/topoml_util/sketch_rnn_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Sketch-RNN Model.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | from keras import backend as K 20 | 21 | import random 22 | 23 | # internal imports 24 | 25 | import numpy as np 26 | import tensorflow as tf 27 | 28 | 29 | # NB: the below are inner functions, not methods of Model 30 | def tf_2d_normal(x1, x2, mu1, mu2, s1, s2, rho): 31 | """Returns result of eq # 24 of http://arxiv.org/abs/1308.0850.""" 32 | # exponentiate the sigmas and also make corr between -1 and 1. 33 | print_op = tf.Print() 34 | s1 = tf.exp(s1) 35 | s2 = tf.exp(s2) 36 | rho = tf.tanh(rho) 37 | 38 | norm1 = tf.subtract(x1, mu1) 39 | norm2 = tf.subtract(x2, mu2) 40 | s1s2 = tf.multiply(s1, s2) 41 | # eq 25 42 | z = (tf.square(tf.div(norm1, s1)) + tf.square(tf.div(norm2, s2)) - 43 | 2 * tf.div(tf.multiply(rho, tf.multiply(norm1, norm2)), s1s2)) 44 | neg_rho = 1 - tf.square(rho) 45 | result = tf.exp(tf.div(-z, 2 * neg_rho)) 46 | denom = 2 * np.pi * tf.multiply(s1s2, tf.sqrt(neg_rho)) 47 | result = tf.div(result, denom) 48 | return result 49 | 50 | 51 | def get_lossfunc(z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, 52 | z_pen_logits, x1_data, x2_data, pen_data): 53 | """Returns a loss fn based on eq #26 of http://arxiv.org/abs/1308.0850.""" 54 | # This represents the L_R only (i.e. does not include the KL loss term). 55 | 56 | result0 = tf_2d_normal(x1_data, x2_data, z_mu1, z_mu2, z_sigma1, z_sigma2, 57 | z_corr) 58 | epsilon = 1e-6 59 | # result1 is the loss wrt pen offset (L_s in equation 9 of 60 | # https://arxiv.org/pdf/1704.03477.pdf) 61 | result1 = tf.multiply(result0, z_pi) 62 | result1 = tf.reduce_sum(result1, 1, keep_dims=True) 63 | result1 = -tf.log(result1 + epsilon) # avoid log(0) 64 | 65 | fs = 1.0 - pen_data[:, 2] # use training data for this 66 | fs = tf.reshape(fs, [-1, 1]) 67 | # Zero out loss terms beyond N_s, the last actual stroke 68 | result1 = tf.multiply(result1, fs) 69 | 70 | # result2: loss wrt pen state, (L_p in equation 9) 71 | result2 = tf.nn.softmax_cross_entropy_with_logits( 72 | labels=pen_data, logits=z_pen_logits) 73 | result2 = tf.reshape(result2, [-1, 1]) 74 | if not self.hps.is_training: # eval mode, mask eos columns 75 | result2 = tf.multiply(result2, fs) 76 | 77 | result = result1 + result2 78 | return result 79 | 80 | 81 | # below is where we need to do MDN (Mixture Density Network) splitting of 82 | # distribution params 83 | def get_mixture_coef(output): 84 | """Returns the tf slices containing mdn dist params.""" 85 | # This uses eqns 18 -> 23 of http://arxiv.org/abs/1308.0850. 86 | z = output 87 | z_pen_logits = z[:, 0:3] # pen states 88 | z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr = tf.split(z[:, 3:], 6, 1) 89 | 90 | # process output z's into MDN paramters 91 | 92 | # softmax all the pi's and pen states: 93 | z_pi = tf.nn.softmax(z_pi) 94 | z_pen = tf.nn.softmax(z_pen_logits) 95 | 96 | # exponentiate the sigmas and also make corr between -1 and 1. 
97 | z_sigma1 = tf.exp(z_sigma1) 98 | z_sigma2 = tf.exp(z_sigma2) 99 | z_corr = tf.tanh(z_corr) 100 | 101 | r = [z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_pen, z_pen_logits] 102 | return r 103 | -------------------------------------------------------------------------------- /model/topoml_util/slack_send.py: -------------------------------------------------------------------------------- 1 | import os 2 | from slackclient import SlackClient 3 | 4 | slack_token = os.environ.get("SLACK_API_TOKEN") 5 | 6 | 7 | def notify(signature, message): 8 | if slack_token: 9 | sc = SlackClient(slack_token) 10 | sc.api_call( 11 | "chat.postMessage", 12 | channel="#machinelearning", 13 | text="Session \n" + signature + "\ncompleted with: " + str(message)) 14 | else: 15 | print('No slack notification: no slack API token environment variable "SLACK_API_TOKEN" set.') 16 | -------------------------------------------------------------------------------- /model/topoml_util/test_GaussianMixtureLoss.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | from topoml_util.test_files import gmm_output 6 | from topoml_util.GaussianMixtureLoss import GaussianMixtureLoss 7 | 8 | sess = tf.InteractiveSession() 9 | DATA_FILE = '../files/geodata_vectorized.npz' 10 | 11 | 12 | class TestGaussianMixtureLoss(unittest.TestCase): 13 | def test_bivariate_gaussian_loss(self): 14 | true = np.array([gmm_output.target]) 15 | pred = np.array([gmm_output.prediction]) 16 | loss = GaussianMixtureLoss(num_components=5, num_points=14).geom_gaussian_mixture_loss(true, pred) 17 | print(loss.eval()) 18 | 19 | def test_single_gaussian_loss(self): 20 | true = np.array([ 21 | [1., 1., 0.], 22 | [1., 1., 0.], 23 | [1., 1., 0.], 24 | [1., 1., 0.], 25 | ]) 26 | pred1 = np.array([ 27 | [1., 1., 0.], 28 | [1., 1., 0.], 29 | [1., 1., 0.], 30 | [1., 1., 0.], 31 | ]) 32 | pred2 = np.array([ 33 | [0., 0., 0.], 34 | [0., 0., 0.], 35 | [0., 0., 0.], 36 | [0., 0., 0.], 37 | ]) 38 | loss1 = GaussianMixtureLoss(num_components=1, num_points=1).univariate_gmm_loss(true, pred1) 39 | loss2 = GaussianMixtureLoss(num_components=1, num_points=1).univariate_gmm_loss(true, pred2) 40 | self.assertLess(loss1.eval(), loss2.eval()) 41 | 42 | -------------------------------------------------------------------------------- /model/topoml_util/test_GeoVectorizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas 5 | from GeoVectorizer import GeoVectorizer, GEO_VECTOR_LEN 6 | from shapely import wkt as wktreader 7 | 8 | TOPOLOGY_CSV = 'test_files/polygon_multipolygon.csv' 9 | SOURCE_DATA = pandas.read_csv(TOPOLOGY_CSV) 10 | brt_wkt = SOURCE_DATA['brt_wkt'] 11 | osm_wkt = SOURCE_DATA['osm_wkt'] 12 | target_wkt = SOURCE_DATA['intersection_wkt'] 13 | 14 | input_geom = np.array([ 15 | [0., 0., 1., 0., 0.], 16 | [0., 1., 1., 0., 0.], 17 | [1., 1., 1., 0., 0.], 18 | [1., 0., 1., 0., 0.], 19 | [0., 0., 0., 1., 0.], 20 | [0., 0., 1., 0., 0.], 21 | [0., -1., 1., 0., 0.], 22 | [-1., -1., 1., 0., 0.], 23 | [-1., 0., 1., 0., 0.], 24 | [0., 0., 0., 0., 1.], 25 | [0., 0., 0., 0., 0.] 
26 | ]) 27 | 28 | output_geom = np.array([ 29 | [0.0, 0.00, 1., 0., 0.], 30 | [0.0, 0.25, 1., 0., 0.], 31 | [0.0, 0.50, 1., 0., 0.], 32 | [0.0, 0.75, 1., 0., 0.], 33 | [0.0, 1.00, 1., 0., 0.], 34 | [0.25, 1.0, 1., 0., 0.], 35 | [0.50, 1.0, 1., 0., 0.], 36 | [1.0, 1.00, 1., 0., 0.], 37 | [1.0, 0.50, 1., 0., 0.], 38 | [1.0, 0.00, 1., 0., 0.], 39 | [0.5, 0.00, 1., 0., 0.], 40 | [0.0, 0.00, 0., 1., 0.], 41 | [0.0, 0.00, 1., 0., 0.], 42 | [0.0, -0.5, 1., 0., 0.], 43 | [0.0, -1.0, 1., 0., 0.], 44 | [-0.5, -1., 1., 0., 0.], 45 | [-1., -1.0, 1., 0., 0.], 46 | [-1., -0.5, 1., 0., 0.], 47 | [-1., 0.00, 1., 0., 0.], 48 | [-0.5, 0.0, 1., 0., 0.], 49 | [0.00, 0.0, 0., 0., 1.], 50 | [0.00, 0.0, 0., 0., 0.] 51 | ]) 52 | 53 | non_empty_geom_collection = 'GEOMETRYCOLLECTION(LINESTRING(1 1, 3 5),POLYGON((-1 -1, -1 -5, -5 -5, -5 -1, -1 -1)))' 54 | 55 | 56 | class TestVectorizer(unittest.TestCase): 57 | def test_max_points(self): 58 | max_points = GeoVectorizer.max_points(brt_wkt, osm_wkt) 59 | self.assertEqual(max_points, 159) 60 | 61 | # def test_interpolate(self): 62 | # interpolated = GeoVectorizer.interpolate(input_geom, len(input_geom) * 2) 63 | # for index, _ in enumerate(interpolated): 64 | # result = list(interpolated[index]) 65 | # expected = list(output_geom[index]) 66 | # self.assertListEqual(result, expected, msg='Lists differ at index %i' % index) 67 | 68 | def test_vectorize_one_wkt(self): 69 | max_points = 20 70 | input_set = SOURCE_DATA['intersection_wkt'] 71 | vectorized = [] 72 | for index in range(len(input_set)): 73 | vectorized.append(GeoVectorizer.vectorize_wkt(input_set[index], max_points, simplify=True)) 74 | self.assertEqual(len(input_set), len(brt_wkt)) 75 | self.assertEqual(vectorized[0].shape, (19, GEO_VECTOR_LEN)) 76 | self.assertEqual(vectorized[1].shape, (1, GEO_VECTOR_LEN)) 77 | 78 | def test_fixed_size(self): 79 | max_points = 20 80 | input_set = SOURCE_DATA['intersection_wkt'] 81 | vectorized = [GeoVectorizer.vectorize_wkt(wkt, max_points, simplify=True, fixed_size=True) for wkt in input_set] 82 | self.assertEqual(np.array(vectorized).shape, (input_set.size, 20, GEO_VECTOR_LEN)) 83 | 84 | def test_non_empty_geom_coll(self): 85 | with self.assertRaises(ValueError): 86 | GeoVectorizer.vectorize_wkt(non_empty_geom_collection, 100) 87 | 88 | def test_point(self): 89 | point_matrix = GeoVectorizer.vectorize_wkt('POINT(12 14)', 5) 90 | self.assertEqual(point_matrix.shape, (1, GEO_VECTOR_LEN)) 91 | 92 | def test_unsupported_geom(self): 93 | # Since 94 | with self.assertRaises(Exception): 95 | GeoVectorizer.vectorize_wkt( 96 | 'TEST_FOR_UNKNOWN_GEOM_TYPE ((10 10, 20 20, 10 40),(40 40, 30 30, 40 20, 30 10))', 16) 97 | 98 | def test_vectorize_big_multipolygon(self): 99 | with open('test_files/big_multipolygon_wkt.txt', 'r') as file: 100 | wkt = file.read() 101 | max_points = GeoVectorizer.max_points([wkt]) 102 | vectorized = GeoVectorizer.vectorize_wkt(wkt, max_points) 103 | self.assertEqual((144, GEO_VECTOR_LEN), vectorized.shape) 104 | 105 | def test_simplify_multipolygon_gt_max_points(self): 106 | with open('test_files/multipart_multipolygon_wkt.txt', 'r') as file: 107 | wkt = file.read() 108 | max_points = 20 109 | vectorized = GeoVectorizer.vectorize_wkt(wkt, max_points, simplify=True) 110 | self.assertEqual((20, GEO_VECTOR_LEN), vectorized.shape) 111 | 112 | def test_multipolygon_exceed_max_points(self): 113 | with open('test_files/multipart_multipolygon_wkt.txt', 'r') as file: 114 | wkt = file.read() 115 | max_points = 20 116 | with self.assertRaises(Exception): 117 | 
GeoVectorizer.vectorize_wkt(wkt, max_points) 118 | 119 | def test_polygon_exceed_max_points(self): 120 | with open('test_files/multipart_multipolygon_wkt.txt', 'r') as file: 121 | wkt = file.read() 122 | shape = wktreader.loads(wkt) 123 | geom = shape.geoms[0] 124 | max_points = 20 125 | with self.assertRaises(Exception): 126 | GeoVectorizer.vectorize_wkt(geom.wkt, max_points) 127 | -------------------------------------------------------------------------------- /model/topoml_util/test_Tokenizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | from Tokenizer import Tokenize 4 | 5 | TOPOLOGY_TRAINING_CSV = 'test_files/example.csv' 6 | source_data = pandas.read_csv(TOPOLOGY_TRAINING_CSV) 7 | raw_training_set = source_data['brt_wkt'] + ' ' + source_data['osm_wkt'] 8 | raw_target_set = source_data['intersection_wkt'] 9 | 10 | 11 | class TestUtil(unittest.TestCase): 12 | def test_truncate(self): 13 | max_len = 500 14 | (input_set, _) = Tokenize.truncate(max_len, raw_training_set, raw_target_set) 15 | for record in input_set: 16 | for field in record: 17 | self.assertLessEqual(len(field), max_len) 18 | 19 | def test_batch_truncate(self): 20 | batch_size = 3 21 | max_len = 1000 22 | validation_split = 0.1 23 | training_set, target_set = Tokenize.batch_truncate(batch_size, max_len, validation_split, raw_training_set, 24 | raw_target_set) 25 | self.assertEqual(len(training_set), 30) 26 | 27 | def test_tokenize(self): 28 | test_strings = ['A test string'] 29 | tokenizer = Tokenize(test_strings) 30 | tokenized = tokenizer.char_level_tokenize(test_strings) 31 | self.assertEqual((tokenizer.word_index, tokenized), 32 | ({' ': 2, 'A': 4, 'e': 5, 'g': 9, 'i': 7, 'n': 8, 'r': 6, 's': 3, 't': 1}, 33 | [[4, 2, 1, 5, 3, 1, 2, 3, 1, 6, 7, 8, 9]])) 34 | 35 | def test_tokenize_example(self): 36 | self.maxDiff = None 37 | test_strings = source_data.as_matrix() 38 | word_index = {'5': 1, '4': 2, '.': 3, '1': 4, '2': 5, '8': 6, ' ': 7, ',': 8, '3': 9, '6': 10, '0': 11, 39 | '9': 12, '7': 13, 'O': 14, '(': 15, ')': 16, 'L': 17, 'Y': 18, 'P': 19, 'G': 20, 'N': 21, 40 | 'T': 22, 'E': 23, 'M': 24, 'I': 25, 'C': 26, 'U': 27, 'R': 28} 41 | tokenizer = Tokenize(test_strings[0] + test_strings[1] + test_strings[2]) 42 | tokenized = tokenizer.char_level_tokenize(test_strings[0]) 43 | self.assertEqual((tokenizer.word_index, tokenized[0][0:15]), 44 | (word_index, 45 | [19, 14, 17, 18, 20, 14, 21, 15, 15, 2, 3, 6, 4, 4, 6])) 46 | 47 | def test_one_hot(self): 48 | source_matrix = source_data.as_matrix() 49 | test_strings = source_matrix[0] + source_matrix[1] 50 | 51 | max_len = 0 52 | for sentence in test_strings: 53 | if len(sentence) > max_len: 54 | max_len = len(sentence) 55 | 56 | tokenizer = Tokenize(test_strings) 57 | matrix = tokenizer.one_hot(test_strings, max_len) 58 | self.assertEqual(matrix[0][0][19], True) # 'P' for POLYGON 59 | 60 | def test_detokenize(self): 61 | test_strings = ['A test string'] 62 | tokenizer = Tokenize(test_strings) 63 | tokenized = tokenizer.char_level_tokenize(test_strings) 64 | detokenized = tokenizer.decypher(tokenized) 65 | self.assertEqual(detokenized, test_strings) 66 | -------------------------------------------------------------------------------- /model/topoml_util/test_files/big_multipolygon_wkt.txt: -------------------------------------------------------------------------------- 1 | MULTIPOLYGON (((6.83347875187002 53.319132848582356, 6.833999853819664 53.31918330312409, 6.836043494585205 53.31930920917599, 
6.835895913178991 53.31813487667021, 6.835980841423087 53.31761993697426, 6.836337152462576 53.31719832819632, 6.837172560351007 53.31682710709162, 6.839040096236629 53.31657989801909, 6.841303534858739 53.31641267969816, 6.843051364598726 53.315732694765565, 6.843804351387401 53.31543051198378, 6.843844040555951 53.31496554356866, 6.849161839342751 53.31495092942536, 6.854083876463372 53.31678939207867, 6.855817302106883 53.31761756753287, 6.850097590717466 53.31124090458927, 6.856680386462843 53.3104102095955, 6.856912492582837 53.31122852049775, 6.859123448118157 53.31125088626337, 6.861203354792467 53.31152026628038, 6.86171860496607 53.311319047409356, 6.863699715806039 53.31102400147218, 6.868928270350355 53.31065634847005, 6.870198211839971 53.30892311646559, 6.856851778148959 53.306587127303814, 6.834839966102368 53.302824411833214, 6.831951384722409 53.30253790911038, 6.827197021362261 53.30176091777146, 6.823401207719804 53.30062293103662, 6.8149978182745246 53.2977805535316, 6.814368920545618 53.297587512118206, 6.814355036282553 53.297579899925616, 6.814344609167823 53.297591710031135, 6.814167366642931 53.29777480453906, 6.814165945015529 53.297776277130794, 6.814156763728136 53.297785759416136, 6.8132054150148535 53.29876810478963, 6.812798235162585 53.299188555730275, 6.8114504868753 53.30058019919939, 6.810796554979605 53.30125536048411, 6.8100540057226056 53.30202197233889, 6.808020590522993 53.3041211487334, 6.807369373578128 53.304791782942885, 6.8074642014179165 53.30482400231268, 6.809360787743384 53.3054683699806, 6.8094433474374805 53.30548788179985, 6.809507351512904 53.30549521230928, 6.809561812550777 53.30549977944671, 6.809633709214066 53.30550530900296, 6.809881554858264 53.30547343849502, 6.809936166023247 53.30547800368545, 6.809968511855471 53.305485259570084, 6.8100214353329465 53.305502336099075, 6.81010782501163 53.30553788833553, 6.810170686045728 53.3055696750993, 6.810242128379014 53.30561142506304, 6.810336248676477 53.30570844357485, 6.810399054046888 53.30578399518662, 6.810520660444627 53.305909174570495, 6.810625372616963 53.305990700250284, 6.8106842287552265 53.30602837564755, 6.810757613771548 53.30605626270446, 6.810840881580534 53.306079000749335, 6.810955806184349 53.30610172291632, 6.811057894566426 53.306117407970795, 6.811492049300044 53.30619808385792, 6.811603756545034 53.30621877682113, 6.809514757555597 53.30895295506413, 6.809417978884093 53.309084764676555, 6.808907354913822 53.30977980216965, 6.808352734605847 53.3104333906923, 6.80827109565832 53.31040110596566, 6.80779479031262 53.31021262930991, 6.8076086001684395 53.31012658269649, 6.80658157663805 53.311034401770506, 6.806528647701738 53.31108112768483, 6.807344330519582 53.31140057658718, 6.808051642865825 53.31167762711978, 6.808064449528665 53.311697425463, 6.808056273864496 53.311713428402804, 6.80745021347699 53.31224603265588, 6.807323362419456 53.312367259332134, 6.807027905044086 53.31264957413987, 6.806965634405601 53.31271797782896, 6.806910322822276 53.31276985438571, 6.806796452836691 53.31286160363022, 6.806611786750899 53.31301808332564, 6.806449445197397 53.31315533716344, 6.806273163757848 53.313352335840335, 6.8061145136413055 53.313487838146735, 6.805917675122718 53.31362567786664, 6.80580331189864 53.313716263642355, 6.805494112983976 53.31398285776417, 6.805107140671515 53.31433493177683, 6.80479416442644 53.31461927164204, 6.804440870955177 53.314937156267426, 6.803874223055891 53.31542515300257, 6.803685238349022 53.31560293172313, 6.803652993117111 
53.315629507152174, 6.803599976096668 53.315678389334735, 6.803524239948995 53.31574829797531, 6.803338105146352 53.31592015611583, 6.803499314648956 53.31596705293939, 6.803527881389022 53.31597636095488, 6.803642509617262 53.31601371265091, 6.803771479418676 53.3160704253172, 6.80391455866432 53.316131823477136, 6.804051480927207 53.31619760793609, 6.804202494474142 53.316281199053954, 6.804207839380495 53.31628859496464, 6.804568764582166 53.31650810246428, 6.804806604068407 53.31665357416869, 6.804896149856973 53.316706975941216, 6.8049633807597 53.316743476412405, 6.805088161704932 53.31680050640781, 6.805242275838586 53.31685961670394, 6.805550628475197 53.316958783406996, 6.805626899179788 53.31698268549428, 6.805725540059064 53.31700686321823, 6.805827736940912 53.317029460370534, 6.807760988463772 53.31669925345747, 6.808857000386165 53.31667225343516, 6.809526380893851 53.316805307837264, 6.810005620768624 53.317002954508865, 6.810811943353177 53.31742700406379, 6.811731841739554 53.31769693190602, 6.812809105837255 53.31769089841459, 6.813935828099859 53.31822963677398, 6.815413044903393 53.31846820665727, 6.816475138225733 53.31859093880479, 6.817382611586303 53.318350427544644, 6.8184281201242944 53.31829136328779, 6.819391014094212 53.31827259252478, 6.820452966469373 53.31865870909369, 6.821619054574962 53.31897448879506, 6.822971223362212 53.31920210221392, 6.82416316749997 53.31920920031828, 6.825290045360304 53.31929738314048, 6.827008187082374 53.31960594368717, 6.828086428006082 53.3196205421661, 6.829063536762218 53.31949312285018, 6.831350888042208 53.31922920403511, 6.832784801743416 53.31906761895827, 6.83347875187002 53.319132848582356))) -------------------------------------------------------------------------------- /model/topoml_util/test_fourier_descriptors.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from pyefd import elliptic_fourier_descriptors 4 | 5 | 6 | class TestFourierDescriptors(unittest.TestCase): 7 | def test_same_descriptors(self): 8 | square1 = [[0, 0], [1, 0], [1, 0.5], [1, 1], [0, 1], [0, 0]] 9 | square2 = [[0, 0], [0.5, 0], [1, 0], [1, 1], [0, 1], [0, 0]] 10 | descriptors1 = elliptic_fourier_descriptors(square1) 11 | descriptors2 = elliptic_fourier_descriptors(square2) 12 | np.testing.assert_array_almost_equal(descriptors1, descriptors2) 13 | 14 | def test_different_descriptors(self): 15 | square1 = [[0, 0], [1, 0], [1, 0.5], [1, 1], [0, 1], [0, 0]] 16 | square2 = [[0, 0], [0.5, 0], [1, 0], [200, 300], [0, 1], [0, 0]] 17 | descriptors1 = elliptic_fourier_descriptors(square1) 18 | descriptors2 = elliptic_fourier_descriptors(square2) 19 | coeffs = np.append(descriptors1, descriptors2, axis=0) 20 | try: 21 | np.testing.assert_array_almost_equal(descriptors1, descriptors2) 22 | except Exception as e: 23 | self.assertEqual('Arrays are not almost equal to 6 decimals', e.args[0][1:42]) 24 | 25 | def test_normalized_descriptors(self): 26 | square1 = [[0, 0], [1, 0], [1, 0.5], [1, 1], [0, 1], [0, 0]] 27 | descriptors1 = elliptic_fourier_descriptors(square1, normalize=True) 28 | descriptors2 = elliptic_fourier_descriptors(square1) 29 | try: 30 | np.testing.assert_array_almost_equal(descriptors1, descriptors2) 31 | except Exception as e: 32 | self.assertEqual('Arrays are not almost equal to 6 decimals', e.args[0][1:42]) 33 | -------------------------------------------------------------------------------- /model/topoml_util/test_geom_scaler.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from topoml_util import geom_scaler as gs 6 | 7 | # noinspection PyUnresolvedReferences 8 | dummy_geom = np.zeros((1, 1, 5)) 9 | 10 | square = np.array([[ 11 | [0., 0., 1., 0., 0.], 12 | [1., 0., 1., 0., 0.], 13 | [1., 1., 1., 0., 0.], 14 | [0., 1., 1., 0., 0.], 15 | [0., 0., 0., 0., 1.], 16 | ]]) 17 | 18 | square_duplicate_nodes = np.array([[ 19 | [0., 0., 1., 0., 0.], 20 | [1., 0., 1., 0., 0.], 21 | [1., 1., 1., 0., 0.], 22 | [1., 1., 1., 0., 0.], 23 | [1., 1., 1., 0., 0.], 24 | [1., 1., 1., 0., 0.], 25 | [1., 1., 1., 0., 0.], 26 | [0., 1., 1., 0., 0.], 27 | [0., 0., 0., 0., 1.], 28 | ]]) 29 | 30 | rectangle = np.array([[ 31 | [0., 0., 1., 0., 0.], 32 | [1., 0., 1., 0., 0.], 33 | [1., 2., 1., 0., 0.], 34 | [0., 2., 1., 0., 0.], 35 | [0., 0., 0., 0., 1.], 36 | ]]) 37 | 38 | normalized_square = np.array([[ 39 | [-1., -1., 1., 0., 0.], 40 | [ 1., -1., 1., 0., 0.], 41 | [ 1., 1., 1., 0., 0.], 42 | [-1., 1., 1., 0., 0.], 43 | [-1., -1., 0., 0., 1.], 44 | ]]) 45 | 46 | 47 | class TestGeomScaler(unittest.TestCase): 48 | def test_localized_mean(self): 49 | means = gs.localized_mean(square) 50 | for mean in means[0]: 51 | self.assertTrue((mean == 0.5).all()) 52 | 53 | def test_localized_mean_rectangle(self): 54 | means = gs.localized_mean(rectangle) 55 | self.assertEqual(means[0, 0, 0], 0.5) 56 | self.assertEqual(means[0, 0, 1], 1) 57 | 58 | def test_localized_mean_dup_nodes(self): 59 | means = gs.localized_mean(square_duplicate_nodes) 60 | self.assertTrue((means == 0.75).all()) 61 | 62 | def test_scaling_square(self): 63 | scale = gs.scale(square) 64 | self.assertEqual(scale, 0.5) 65 | 66 | def test_scaling_square_dup_nodes(self): 67 | scale = gs.scale(square_duplicate_nodes) 68 | self.assertEqual(scale, 0.5) 69 | 70 | def test_transform(self): 71 | # scaled_square = square[0] * 2 72 | # scaled_square[4, 12] = 1. 73 | scale = gs.scale(square) 74 | n_square = gs.transform(square, scale=scale) 75 | self.assertTrue((n_square == normalized_square).all()) 76 | coords = [geom[:, :2].flatten() for geom in n_square] 77 | coords = [item for sublist in coords for item in sublist] 78 | std = np.std(coords) 79 | self.assertAlmostEqual(std, 1., 1) 80 | 81 | def test_upsized_transform(self): 82 | square_0 = square[0] * 2 83 | square_0[:4, 2] = 1. 84 | square_0[4, 4] = 1. 
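        # doubling square[0] above also doubled the one-hot flags in columns 2 and 4; they are reset to 1. here so that only the coordinates end up scaled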
85 | scale = gs.scale([square_0]) 86 | n_square = gs.transform([square_0], scale=scale) 87 | self.assertTrue((n_square == normalized_square).all()) 88 | coords = [geom[:, :2].flatten() for geom in n_square] 89 | coords = [item for sublist in coords for item in sublist] 90 | std = np.std(coords) 91 | self.assertAlmostEqual(std, 1., 1) 92 | -------------------------------------------------------------------------------- /model/topoml_util/test_np_gaussian_2d_loss.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from np_gaussian_2d_loss import np_r2_bivariate_gaussian_loss, np_r4_bivariate_gaussian, softmax, epsilon, \ 4 | np_r4_bivariate_gaussian_loss 5 | 6 | 7 | class TestNumpy2DGaussianLoss(unittest.TestCase): 8 | def test_r2_2d_loss(self): 9 | vec_in = np.array([[1, 1, 0, 0, 0]]) 10 | vec_out = vec_in 11 | loss1 = np_r2_bivariate_gaussian_loss(vec_in, vec_out) 12 | vec_out = np.array([[1, 1, 5, 5, 0]]) 13 | loss2 = np_r2_bivariate_gaussian_loss(vec_in, vec_out) 14 | self.assertLess(loss1, loss2) 15 | 16 | vec_out = np.array([[1, 1, 5, -5, 0]]) 17 | loss3 = np_r2_bivariate_gaussian_loss(vec_in, vec_out) 18 | self.assertLess(loss1, loss3) 19 | self.assertEqual(loss2, loss3) 20 | 21 | def test_r4_bivariate_gaussian_loss(self): 22 | vec_in = np.array([[[[1, 1, 0, 0, 0, 0]]]]) 23 | vec_out = vec_in 24 | loss1 = np_r4_bivariate_gaussian_loss(vec_in, vec_out) 25 | vec_out = np.array([[[[1, 1, 5, 5, 0, 0]]]]) 26 | loss2 = np_r4_bivariate_gaussian_loss(vec_in, vec_out) 27 | self.assertLess(loss1, loss2) 28 | 29 | vec_out = np.array([[[[1, 1, 5, -5, 0, 0]]]]) 30 | loss3 = np_r4_bivariate_gaussian_loss(vec_in, vec_out) 31 | self.assertLess(loss1, loss3) 32 | self.assertEqual(loss2, loss3) 33 | 34 | def test_r4_bivariate_gmm_zeros_loss(self): 35 | vec_in = np.array([[[[0, 0, 0, 0, 0, 0]]]]) 36 | vec_in = np.repeat(vec_in, 6, axis=2) # 6 gaussian mixture components 37 | pi_index = 5 38 | pi_weights = softmax(vec_in[:, :, :, pi_index]) 39 | vec_out = vec_in 40 | loss1 = np_r4_bivariate_gaussian(vec_in, vec_out) 41 | loss1 = loss1 * pi_weights 42 | gmm_loss1 = np.sum(-np.log(loss1 + epsilon), keepdims=True) 43 | 44 | vec_out = np.array([[[[1, 1, 5, 5, 0, 0]]]]) 45 | vec_out = np.repeat(vec_out, 6, axis=2) 46 | pi_weights = softmax(vec_out[:, :, :, pi_index]) 47 | loss2 = np_r4_bivariate_gaussian(vec_in, vec_out) * pi_weights 48 | gmm_loss2 = np.sum(-np.log(loss2 + epsilon), keepdims=True) 49 | self.assertLess(gmm_loss1[0, 0, 0], gmm_loss2[0, 0, 0]) 50 | -------------------------------------------------------------------------------- /model/topoml_util/test_rasterization.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from rasterio import features 4 | from shapely import wkt 5 | 6 | 7 | class TestRasterize(unittest.TestCase): 8 | def test_first(self): 9 | size = 20 10 | first = "POLYGON(({0} {0}, {0} -{0}, -{0} -{0}, -{0} {0}, {0} {0}))".format(size) 11 | geo_interfaces = [wkt.loads(first).__geo_interface__] 12 | raster = features.rasterize(geo_interfaces, out_shape=[255, 255]) 13 | self.assertEqual(raster[100, 100], 1) 14 | -------------------------------------------------------------------------------- /model/topoml_util/test_sketch-rnn-model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from keras import backend as K 3 | import tensorflow as tf 4 | import numpy as np 5 | from sketch_rnn_model 
import tf_2d_normal 6 | 7 | PRECISION = 6 8 | sess = tf.InteractiveSession() 9 | 10 | 11 | class TestSketchRnnLoss(unittest.TestCase): 12 | def test_2d_gaussian_zeros(self): 13 | target = np.array([[[0, 0]]], dtype=float) 14 | prediction = np.array([[[0, 0, 0, 0, 0]]], dtype=float) 15 | args = np.append(target, prediction) 16 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 17 | self.assertAlmostEqual(loss, 1.1048509233685306, places=PRECISION) 18 | 19 | def test_2d_gaussian_small_mu_diff(self): 20 | target = np.array([[[5, 52]]], dtype=float) 21 | prediction = np.array([[[5 + 1e-6, 52 + 1e-6, 0, 0, 0]]], dtype=float) 22 | args = np.append(target, prediction) 23 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 24 | self.assertAlmostEqual(loss, 1.1048509233706119, places=PRECISION) 25 | 26 | def test_2d_gaussian_small_sigma_diff(self): 27 | tensor_train = np.array([[[5, 52]]], dtype=float) 28 | tensor_predict = np.array([[[5, 52, 1e-6, 1e-6, 0]]], dtype=float) 29 | loss = tf_2d_normal(tensor_train, tensor_predict).eval() 30 | self.assertAlmostEqual(loss, 1.1048523660629765, places=PRECISION) 31 | 32 | def test_2d_gaussian_mu_ones(self): 33 | target = np.array([[[1, 1]]], dtype=float) 34 | prediction = np.array([[[1, 1, 1, 1, 0]]], dtype=float) 35 | args = np.append(target, prediction) 36 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 37 | self.assertAlmostEqual(loss, 2.3829037437816121, places=PRECISION) 38 | 39 | def test_2d_gaussian_mu_minus_ones(self): 40 | target = np.array([[[1, 1]]], dtype=float) 41 | prediction = np.array([[[1, 1, -1, -1, 0]]], dtype=float) 42 | args = np.append(target, prediction) 43 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 44 | self.assertAlmostEqual(loss, 2.3829037437816121, places=PRECISION) 45 | 46 | def test_2d_gaussian_ones(self): 47 | target = np.array([[[1, 1]]], dtype=float) 48 | prediction = np.array([[[1, 1, 1, 1, 1]]], dtype=float) 49 | args = np.append(target, prediction) 50 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 51 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 52 | 53 | def test_2d_gaussian_rho_one(self): 54 | target = np.array([[[1, 2]]], dtype=float) 55 | prediction = np.array([[[1, 2, 0, 0, 1]]], dtype=float) 56 | args = np.append(target, prediction) 57 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 58 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 59 | 60 | def test_2d_gaussian_rho_minus_one(self): 61 | target = np.array([[[1, 2]]], dtype=float) 62 | prediction = np.array([[[1, 2, 0, 0, -1]]], dtype=float) 63 | args = np.append(target, prediction) 64 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 65 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 66 | 67 | def test_2d_gaussian_rho_two(self): 68 | target = np.array([[[1, 2]]], dtype=float) 69 | prediction = np.array([[[1, 2, 0, 0, 2]]], dtype=float) 70 | args = np.append(target, prediction) 71 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 72 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 73 | 74 | def test_2d_gaussian_rho_minus_two(self): 75 | target = np.array([[[1, 2]]], dtype=float) 76 | prediction = np.array([[[1, 2, 0, 0, -2]]], dtype=float) 77 | args = np.append(target, prediction) 78 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 79 | self.assertAlmostEqual(loss, 1.9491232946784192, places=PRECISION) 80 | 81 | def test_2d_gaussian_big_diff(self): 82 | target = np.array([[[5, 52]]], dtype=float) 83 
| prediction = np.array([[[1, 2, 3, 4, 5]]], dtype=float) 84 | args = np.append(target, prediction) 85 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 86 | self.assertAlmostEqual(loss, 16.11809565095832, places=PRECISION) 87 | 88 | def test_2d_gaussian_really_big_diff(self): 89 | target = np.array([[[5, 52]]], dtype=float) 90 | prediction = np.array([[[0, 0, 3, 4, 5]]], dtype=float) 91 | args = np.append(target, prediction) 92 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 93 | self.assertAlmostEqual(loss, 16.11809565095832, places=PRECISION) 94 | 95 | def test_2d_gaussian_max_neg_rho(self): 96 | min_rho = -19.06 # This is about the limit of rho before geom_gaussian_loss returns NaN 97 | target = np.array([[[5, 52]]], dtype=float) 98 | prediction = np.array([[[5, 52, -1, -1, min_rho]]], dtype=float) 99 | args = np.append(target, prediction) 100 | loss = -K.log(tf_2d_normal(*args) + K.epsilon()).eval() 101 | self.assertAlmostEqual(loss, -18.505382378927028, places=PRECISION) -------------------------------------------------------------------------------- /model/topoml_util/test_wkt2pyplot.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from datetime import datetime 4 | 5 | from shapely.geometry import Point 6 | from topoml_util.wkt2pyplot import wkt2pyplot 7 | 8 | from topoml_util.GeoVectorizer import GeoVectorizer 9 | 10 | 11 | class TestWktToPyplotPoly(unittest.TestCase): 12 | def test_geometrycollection_empty(self): 13 | inputs = ['GEOMETRYCOLLECTION EMPTY'] # This is valid WKT 14 | plt, fig, ax = wkt2pyplot(inputs) 15 | plt.show() # It should show an empty plot 16 | 17 | def test_polygon_conversion(self): 18 | TIMESTAMP = str(datetime.now()).replace(':', '.') 19 | 20 | inputs = 'POLYGON((1.09872727273 -0.289454545452,-0.241272727273 0.682545454538,-0.992272727274 ' \ 21 | '0.292545454528,0.347727272727 -0.680454545474,1.09872727273 -0.289454545452))\nPOLYGON((' \ 22 | '-0.976272727273 0.302545454574,-0.25627272727 0.676545454539,1.05372727273 -0.276454545443,' \ 23 | '0.320727272731 -0.654454545455,-0.477272727268 -0.0664545454754,-0.976272727273 ' \ 24 | '0.302545454574))' 25 | inputs = inputs.split('\n') 26 | 27 | target = 'POLYGON((-0.974272727277 0.301545454562,-0.255272727276 0.675545454527,1.05372727273 ' \ 28 | '-0.276454545443,0.320727272731 -0.654454545455,-0.477272727268 -0.0664545454754,-0.974272727277 ' \ 29 | '0.301545454562))' 30 | 31 | prediction = [ 32 | 'POINT(-0.974272727277 0.301545454562)', 33 | 'POINT(-0.255272727276 0.675545454527)', 34 | 'POINT(1.05372727273 -0.276454545443)', 35 | 'POINT(0.320727272731 -0.654454545455)', 36 | 'POINT(-0.477272727268 -0.0664545454754)', 37 | 'POINT(-0.974272727277 0.301545454562)', 38 | ] 39 | plt, fig, ax = wkt2pyplot(inputs, [target], prediction) 40 | plt.text(0.01, 0.06, 'prediction: some more text', transform=ax.transAxes) 41 | plt.text(0.01, 0.01, 'target: some text', transform=ax.transAxes) 42 | 43 | plt.show() 44 | 45 | def test_gaussian_sample_plot(self): 46 | 47 | inputs = 'POLYGON((1.09872727273 -0.289454545452,-0.241272727273 0.682545454538,-0.992272727274 ' \ 48 | '0.292545454528,0.347727272727 -0.680454545474,1.09872727273 -0.289454545452))\nPOLYGON((' \ 49 | '-0.976272727273 0.302545454574,-0.25627272727 0.676545454539,1.05372727273 -0.276454545443,' \ 50 | '0.320727272731 -0.654454545455,-0.477272727268 -0.0664545454754,-0.976272727273 ' \ 51 | '0.302545454574))' 52 | inputs = inputs.split('\n') 53 | 54 | target = 
np.array([ 55 | # mu1 mu2 s1 s2 rho pi [geo type one-hot ] [render 1hot] 56 | [0.1, 0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., ], 57 | [0.1, -0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., ], 58 | [-0.1, -0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., ], 59 | [-0.1, 0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., ], 60 | [0.1, 0.1, 0.1, 0.1, 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., ], 61 | [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., ], 62 | [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., ] 63 | ]) 64 | 65 | target = [Point(point).wkt for point in 66 | GeoVectorizer(gmm_size=1).decypher_gmm_geom(target, 1000)] 67 | 68 | plt, fig, ax = wkt2pyplot(inputs, target, None) 69 | plt.show() 70 | -------------------------------------------------------------------------------- /model/topoml_util/wkt2pyplot.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import matplotlib 4 | import os 5 | 6 | if not os.environ.get('MATPLOTLIB_TEST'): 7 | matplotlib.use('Agg') # for headless machine instances 8 | 9 | from shapely import wkt 10 | from matplotlib import pyplot as plt 11 | 12 | 13 | def wkt2pyplot(input_wkts, target_wkts=None, prediction_wkts=None, 14 | input_color='green', target_color='red', pred_color='blue'): 15 | """ 16 | Convert arrays of input, target and prediction well-known text (WKT) geometries to a pyplot figure 17 | :param input_wkts: an array of input geometries, rendered in (standard) green 18 | :param target_wkts: optional array of target geometries, rendered in (standard) red 19 | :param prediction_wkts: optional array of prediction geometries, rendered in (standard) blue 20 | :param input_color: a pyplot-compatible notation of color, default green 21 | :param target_color: a pyplot-compatible notation of color, default red 22 | :param pred_color: a pyplot-compatible notation of color, default blue 23 | :return: the matplotlib plt, fig and ax 24 | """ 25 | input_geoms = [wkt.loads(input_wkt) for input_wkt in input_wkts] 26 | 27 | fig, ax = plt.subplots() 28 | 29 | input_polys = [] 30 | for input_geom in input_geoms: 31 | if len(input_geom.bounds) > 0 and input_geom.geom_type == 'Polygon': 32 | input_polys.append(matplotlib.patches.Polygon(input_geom.boundary.coords)) 33 | 34 | inputs = matplotlib.collections.PatchCollection(input_polys, alpha=0.4, linewidth=1) 35 | inputs.set_color(input_color) 36 | ax.add_collection(inputs) 37 | 38 | # target_polys = [Polygon(target_geom.boundary.coords) for target_geom in target_geoms] 39 | # targets = PatchCollection(target_polys, alpha=0.4, linewidth=1) 40 | # targets.set_color(target_color) 41 | # ax.add_collection(targets) 42 | 43 | # TODO: handle other types of geometries 44 | # TODO: handle holes in polygons (donuts) 45 | if target_wkts: 46 | target_geoms = [wkt.loads(target_wkt) for target_wkt in target_wkts] 47 | for geom in target_geoms: 48 | if geom.type == 'Point': 49 | plt.plot(geom.coords.xy[0][0], geom.coords.xy[1][0], 50 | marker='o', color=target_color, alpha=0.4, linewidth=0) 51 | elif geom.type == 'Polygon': 52 | collection = matplotlib.collections.PatchCollection([matplotlib.patches.Polygon(geom.boundary.coords)], 53 | alpha=0.4, linewidth=1) 54 | collection.set_color(target_color) 55 | ax.add_collection(collection) 56 | 57 | if prediction_wkts: 58 | prediction_geoms = [wkt.loads(prediction_wkt) for prediction_wkt in
prediction_wkts] 59 | for geom in prediction_geoms: 60 | if geom.geom_type == 'Point': 61 | plt.plot(geom.coords.xy[0][0], geom.coords.xy[1][0], 62 | marker='o', color=pred_color, alpha=0.1, linewidth=0) 63 | elif geom.type == 'Polygon': 64 | collection = matplotlib.collections.PatchCollection([matplotlib.patches.Polygon(geom.boundary.coords)], 65 | alpha=0.4, linewidth=1) 66 | collection.set_color(pred_color) 67 | ax.add_collection(collection) 68 | 69 | plt.axis('auto') 70 | 71 | return plt, fig, ax 72 | 73 | 74 | def save_plot(geoms, plot_dir='plots', timestamp=None): 75 | os.makedirs(str(plot_dir), exist_ok=True) 76 | plt, fig, ax = wkt2pyplot(*geoms) 77 | plt.savefig(plot_dir + '/plt_' + timestamp + '.png') 78 | plt.close('all') 79 | -------------------------------------------------------------------------------- /prep/ProgressBar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from http://stackoverflow.com/questions/3160699/python-progress-bar 3 | """ 4 | 5 | import sys 6 | 7 | from time import time 8 | 9 | 10 | class ProgressBar: 11 | """ 12 | Class for creating std output progress indication bars 13 | """ 14 | def __init__(self, bar_length=40): 15 | """ 16 | Constructor 17 | :param bar_length: length of the bar in characters 18 | """ 19 | self.start_seconds = time() 20 | self.bar_length = bar_length 21 | 22 | def update_progress(self, progress, status=''): 23 | """ 24 | update_progress() : Displays or updates a std out progress bar 25 | 26 | The method simply repeats on the console each time the method is called 27 | :param status: Optional status message 28 | :param progress: Accepts a float between 0 and 1. Any int will be converted to a float. 29 | A value under 0 represents a 'halt'. 
30 | A value at 1 or bigger represents 100% 31 | :return: None 32 | """ 33 | 34 | if isinstance(progress, int): 35 | progress = float(progress) 36 | if not isinstance(progress, float): 37 | raise ValueError("error: progress must be numeric") 38 | if progress < 0: 39 | progress = 0 40 | status = "Halt...\r\n" 41 | if progress >= 1: 42 | progress = 1 43 | status = "Done...\r\n" 44 | 45 | progress_rounded = "{:10.2f}".format(float(progress*100)) 46 | elapsed_time = time() - self.start_seconds 47 | if progress > 0: 48 | projected_time = elapsed_time / progress - elapsed_time 49 | else: 50 | projected_time = 0 51 | 52 | block = round(self.bar_length * min(progress, 1)) 53 | progress_line = "\U000025B0" * (max(0, block - 1)) + "\U000025BA" 54 | progress_line += "\U000025B1" * (self.bar_length - block) 55 | 56 | hours, remainder = divmod(projected_time, 3600) 57 | minutes, seconds = divmod(remainder, 60) 58 | eta = '{}h{}m{}s'.format(int(hours), int(minutes), int(seconds)) 59 | 60 | text = "\r\U0001F680{}\U0001F3C1 {}% {} {}".format(progress_line, progress_rounded, eta, status) 61 | sys.stdout.write(text) 62 | sys.stdout.flush() 63 | -------------------------------------------------------------------------------- /prep/densified.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from topoml_util.GeoVectorizer import GeoVectorizer 4 | from topoml_util.geom_scaler import localized_mean, localized_normal 5 | 6 | DATA_FILE = '../files/brt_osm/brt_osm.npz' 7 | TARGET_FILE = '../files/brt_osm/densified_vectorized.npz' 8 | DENSIFIED = 100 9 | 10 | loaded = np.load(DATA_FILE) 11 | raw_training_vectors = loaded['input_geoms'] 12 | raw_target_vectors = loaded['intersection'] 13 | 14 | training_vectors = [] 15 | target_vectors = [] 16 | 17 | # skip non-intersecting geometries 18 | for train, target in zip(raw_training_vectors, raw_target_vectors): 19 | if not target[0, 0] == 0: # a zero coordinate designates an empty geometry 20 | training_vectors.append(train) 21 | target_vectors.append(target) 22 | 23 | print('Preprocessing vectors...') 24 | means = localized_mean(training_vectors) 25 | training_vectors = localized_normal(training_vectors, means, 1e4) 26 | training_vectors = np.array([GeoVectorizer.interpolate(vector, DENSIFIED) for vector in training_vectors]) 27 | target_vectors = localized_normal(target_vectors, means, 1e4) 28 | target_vectors = np.array([GeoVectorizer.interpolate(vector, 50) for vector in target_vectors]) 29 | 30 | print('Saving compressed numpy data file', TARGET_FILE) 31 | 32 | np.savez_compressed( 33 | TARGET_FILE, 34 | input_geoms=training_vectors, 35 | intersection=target_vectors 36 | ) 37 | 38 | print('Done!') 39 | -------------------------------------------------------------------------------- /prep/export-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "exporting data, this will take a couple of minutes..." 3 | rm /data/files/brt_osm/brt_osm.csv 4 | # extract the joined data 5 | # https://gis.stackexchange.com/questions/185072/ogr2ogr-sql-query-from-text-file#185141 6 | cd /data/files 7 | set -ex 8 | ogr2ogr -f CSV brt_osm/brt_osm.csv PG:"host=postgis port=5432 dbname=postgres user=postgres password=postgres" -sql @../prep/spatial-join.sql 9 | set -e 10 | lines=$(tail -n +2 brt_osm.csv | wc -l) 11 | echo 12 | echo "Wrote $lines number of data points" 13 | echo "The export script ran successfully. 
The generated data set was saved to files/brt_osm/brt_osm.csv" 14 | 15 | cd ../prep 16 | echo "Creating BRT/OSM numpy archive..." 17 | python3 vectorize_brt_osm.py 18 | echo "Creating neighborhoods numpy archive..." 19 | python3 preprocess-neighborhoods.py 20 | 21 | echo "Done!" -------------------------------------------------------------------------------- /prep/get-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "importing data... this will take a while depending on your internet connection speed" 3 | set -ex 4 | mkdir -p /data/files/base_registration_topography /data/files/openstreetmap /data/files/neighborhoods /data/files/buildings 5 | cd /data/files 6 | 7 | # Get the Base Registration for Topography data 8 | curl -o base_registration_topography/TOP10NL_25W.zip https://geodata.nationaalgeoregister.nl/top10nlv2/extract/kaartbladen/TOP10NL_25W.zip?formaat=gml 9 | curl -o base_registration_topography/TOP10NL_34O.zip https://geodata.nationaalgeoregister.nl/top10nlv2/extract/kaartbladen/TOP10NL_34O.zip?formaat=gml 10 | 11 | # Get the OpenStreetMap data 12 | curl -o openstreetmap/netherlands-latest-free.shp.zip http://download.geofabrik.de/europe/netherlands-latest-free.shp.zip 13 | 14 | # Get neighborhoods 15 | curl -X GET \ 16 | -o neighborhoods/neighborhoods.csv \ 17 | 'https://geodata.nationaalgeoregister.nl/wijkenbuurten2017/wfs?request=GetFeature&service=WFS&version=2.0.0&typeName=cbs_buurten_2017&outputFormat=csv&srsName=EPSG%3A4326&PropertyName=aantal_inwoners%2Cgeom' 18 | 19 | # Get BAG buildings 20 | types=( woonfunctie winkelfunctie bijeenkomstfunctie onderwijsfunctie gezondheidszorgfunctie kantoorfunctie industriefunctie sportfunctie logiesfunctie ) 21 | pages=( 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 ) 22 | 23 | for type in "${types[@]}" 24 | do 25 | url="https://geodata.nationaalgeoregister.nl/bag/wfs?request=GetFeature&service=WFS&version=2.0.0&typeName=pand&outputFormat=csv&srsName=EPSG%3A4326&PropertyName=geometrie%2Cgebruiksdoel&cql_filter=(gebruiksdoel%3D'${type}')" 26 | curl -X GET ${url} | grep -e gebruiksdoel -e pand > buildings/buildings-${type}.csv 27 | for page in "${pages[@]}" 28 | do 29 | url="https://geodata.nationaalgeoregister.nl/bag/wfs?request=GetFeature&service=WFS&version=2.0.0&typeName=pand&outputFormat=csv&srsName=EPSG%3A4326&PropertyName=geometrie%2Cgebruiksdoel&startIndex="${page}"&cql_filter=(gebruiksdoel%3D'${type}')" 30 | echo ${url} 31 | curl -X GET ${url} | grep -v gebruiksdoel | grep pand >> buildings/buildings-${type}.csv 32 | done 33 | done 34 | 35 | # Inflate 36 | unzip -o base_registration_topography/TOP10NL_25W.zip 37 | unzip -o base_registration_topography/TOP10NL_34O.zip 38 | unzip -o openstreetmap/netherlands-latest-free.shp.zip *buildings* 39 | 40 | # Load the database.
Be sure to have the postgis container running 41 | ogr2ogr -f "PostgreSQL" PG:"host=postgis port=5432 dbname=postgres user=postgres password=postgres" base_registration_topography/TOP10NL_25W.gml -overwrite -progress -t_srs "EPSG:4326" -oo GML_ATTRIBUTES_TO_OGR_FIELDS=YES 42 | ogr2ogr -f "PostgreSQL" PG:"host=postgis port=5432 dbname=postgres user=postgres password=postgres" base_registration_topography/TOP10NL_34O.gml -append -progress -t_srs "EPSG:4326" -oo GML_ATTRIBUTES_TO_OGR_FIELDS=YES 43 | # https://trac.osgeo.org/gdal/ticket/4939 44 | # http://www.bostongis.com/PrinterFriendly.aspx?content_name=ogr_cheatsheet 45 | ogr2ogr -f "PostgreSQL" PG:"host=postgis port=5432 dbname=postgres user=postgres password=postgres" openstreetmap/gis.osm_buildings_a_free_1.shp -overwrite -progress -nln osm_buildings -nlt PROMOTE_TO_MULTI -lco EXTRACT_SCHEMA_FROM_LAYER_NAME=no 46 | 47 | bash ./export-data.sh -------------------------------------------------------------------------------- /prep/preprocess-neighborhoods.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocessing script to convert well-known-text geometries to matrix representations thereof. 3 | With a SANE_NUMBER_OF_POINTS set to 2048, it simplifies only 248 4 | """ 5 | 6 | import os 7 | from datetime import timedelta 8 | from time import time 9 | from zipfile import ZipFile 10 | 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | from pandas import read_csv 14 | from shapely import wkt 15 | from sklearn.model_selection import train_test_split 16 | 17 | from model.topoml_util.GeoVectorizer import GeoVectorizer 18 | from model.topoml_util.geom_fourier_descriptors import create_geom_fourier_descriptor 19 | from prep.ProgressBar import ProgressBar 20 | 21 | SCRIPT_VERSION = '7' 22 | SOURCE_DIR = '../files/neighborhoods/' 23 | SOURCE_ZIP = SOURCE_DIR + 'neighborhoods.csv.zip' 24 | SOURCE_CSV = 'neighborhoods.csv' 25 | LOG_FILE = 'neighborhoods_preprocessing.log' 26 | TRAIN_DATA_FILE = SOURCE_DIR + 'neighborhoods_train_v' + SCRIPT_VERSION 27 | TEST_DATA_FILE = SOURCE_DIR + 'neighborhoods_test_v' + SCRIPT_VERSION 28 | SANE_NUMBER_OF_POINTS = 2048 29 | REDUCED_POINTS = 256 30 | TRAIN_TEST_SPLIT = 0.1 31 | FOURIER_DESCRIPTOR_ORDER = 32 # The axis 0 size 32 | SCRIPT_START = time() 33 | 34 | if not os.path.isfile(SOURCE_ZIP): 35 | raise FileNotFoundError('Unable to locate {}. 
Please run the prep/get-data.sh script first'.format(SOURCE_ZIP)) 36 | 37 | print('Preprocessing neighborhoods...') 38 | zip_file = ZipFile(SOURCE_ZIP) 39 | df = read_csv(zip_file.open(SOURCE_CSV)) 40 | df = df[df.aantal_inwoners >= 0] # Filter out negative placeholder values for unknowns 41 | 42 | print('Creating geometry vectors and descriptors...') 43 | wkt_vectors = [] 44 | shapes = [wkt.loads(wkt_string) for wkt_string in df.geom.values] 45 | number_of_vertices = [GeoVectorizer.num_points_from_wkt(shape.wkt) for shape in shapes] 46 | 47 | plt.hist(number_of_vertices, bins=20, log=True) 48 | plt.savefig('neighborhood_geom_vertices_distr.png') 49 | geoms_above_threshold = len([v for v in number_of_vertices if v > SANE_NUMBER_OF_POINTS]) 50 | print('{} of the {} geometries are over the max {} vertices threshold and will be simplified.\n'.format( 51 | geoms_above_threshold, len(shapes), SANE_NUMBER_OF_POINTS)) 52 | 53 | pgb = ProgressBar() 54 | logfile = open(LOG_FILE, 'w') 55 | selected_data = [] 56 | simplified_geometries = 0 57 | errors = 0 58 | 59 | for index, (inhabitants, wkt_string) in enumerate(zip(df.aantal_inwoners.values, df.geom.values)): 60 | pgb.update_progress(index/len(df.geom.values), '{} geometries, {} errors in logfile'.format(index, errors)) 61 | try: 62 | shape = wkt.loads(wkt_string) 63 | fixed_size_wkt_vector = GeoVectorizer.vectorize_wkt(wkt_string, REDUCED_POINTS, simplify=True, fixed_size=True) 64 | 65 | geom_len = min(GeoVectorizer.num_points_from_wkt(shape.wkt), SANE_NUMBER_OF_POINTS) 66 | if geom_len == SANE_NUMBER_OF_POINTS: 67 | simplified_geometries += 1 68 | wkt_vector = GeoVectorizer.vectorize_wkt(wkt_string, geom_len, simplify=True) 69 | 70 | # If multipart multipolygon: select the largest, but it will throw off the accuracy a bit.
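        # Only the largest part is used for the Fourier descriptors below; the vectorized geometries above still cover the full multipart shape.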
71 | if shape.geom_type == 'MultiPolygon': 72 | if len(shape.geoms) > 1: 73 | geometries = sorted(shape.geoms, key=lambda x: x.area) 74 | shape = geometries[-1] 75 | else: 76 | shape = shape.geoms[0] 77 | elif shape.geom_type == 'Polygon': 78 | pass 79 | else: 80 | logfile.write('skipping record: no (multi)polygon entry in {} on line {}\n'.format( 81 | SOURCE_CSV, index + 2)) 82 | errors += 1 83 | continue 84 | 85 | efds = create_geom_fourier_descriptor(shape, FOURIER_DESCRIPTOR_ORDER) 86 | 87 | except Exception as e: 88 | logfile.write('Skipping record on account of geometry entry in {} on line {} with error: {}\n'.format( 89 | SOURCE_CSV, index + 2, e)) 90 | errors += 1 91 | continue 92 | 93 | # Append the converted values if all went well 94 | selected_data.append({ 95 | 'geom': wkt_vector, 96 | 'fixed_size_geom': fixed_size_wkt_vector, 97 | 'elliptic_fourier_descriptors': efds, 98 | 'inhabitants': inhabitants, 99 | }) 100 | 101 | logfile.close() 102 | print('\ncreated {} data points with {} simplified geometries and {} errors'.format( 103 | len(selected_data), simplified_geometries, errors)) 104 | 105 | median = np.median([p['inhabitants'] for p in selected_data]) 106 | print('Median:', median, 'inhabitants') 107 | 108 | # Split and save data 109 | train, test = train_test_split(selected_data, test_size=TRAIN_TEST_SPLIT, random_state=42) 110 | 111 | print('Saving test data...') 112 | # Test data is small enough to put in one archive 113 | np.savez_compressed( 114 | TEST_DATA_FILE, 115 | geoms=[record['geom'] for record in test], 116 | fixed_size_geoms=[record['fixed_size_geom'] for record in test], 117 | elliptic_fourier_descriptors=[record['elliptic_fourier_descriptors'] for record in test], 118 | inhabitants=[record['inhabitants'] for record in test], 119 | above_or_below_median=[int(record['inhabitants'] > median) for record in test], 120 | type_index={0: 'less than median', 1: 'greater than or equal to median'}, 121 | ) 122 | 123 | print('Saving training data...') 124 | np.savez_compressed( 125 | TRAIN_DATA_FILE, 126 | geoms=[record['geom'] for record in train], 127 | fixed_size_geoms=[record['fixed_size_geom'] for record in train], 128 | elliptic_fourier_descriptors=[record['elliptic_fourier_descriptors'] for record in train], 129 | inhabitants=[record['inhabitants'] for record in train], 130 | above_or_below_median=[int(record['inhabitants'] > median) for record in train], 131 | type_index={0: 'less than median', 1: 'greater than or equal to median'}, 132 | ) 133 | 134 | runtime = time() - SCRIPT_START 135 | print('Done in {}'.format(timedelta(seconds=runtime))) 136 | -------------------------------------------------------------------------------- /prep/spatial-join.sql: -------------------------------------------------------------------------------- 1 | SELECT st_astext(st_snaptogrid(gebouw.wkb_geometry, 0.0000001)) AS brt_wkt, 2 | st_astext(st_snaptogrid(ST_GeometryN(osm_buildings.wkb_geometry, 1), 0.0000001)) AS osm_wkt, 3 | st_astext(st_snaptogrid(st_intersection(gebouw.wkb_geometry, osm_buildings.wkb_geometry), 0.0000001)) AS intersection_wkt, 4 | st_distance(st_transform(st_centroid(gebouw.wkb_geometry), 28992), st_transform(st_centroid(osm_buildings.wkb_geometry), 28992))::real AS centroid_distance, 5 | st_distance(st_transform(gebouw.wkb_geometry, 28992), st_transform(osm_buildings.wkb_geometry, 28992))::real AS geom_distance, 6 | st_astext(st_snaptogrid(st_centroid(gebouw.wkb_geometry), 0.0000001)) AS brt_centroid_wkt, 7 | st_astext(st_snaptogrid(st_centroid(osm_buildings.wkb_geometry),
0.0000001)) AS osm_centroid_wkt, 8 | st_astext(st_snaptogrid(st_transform(st_centroid(gebouw.wkb_geometry), 28992), 0.0000001)) AS brt_centroid_wkt_rd, 9 | st_astext(st_snaptogrid(st_transform(st_centroid(osm_buildings.wkb_geometry), 28992), 0.0000001)) AS osm_centroid_wkt_rd, 10 | st_area(st_transform(st_intersection(gebouw.wkb_geometry, osm_buildings.wkb_geometry), 28992))::real AS intersection_surface 11 | FROM gebouw, osm_buildings 12 | WHERE 13 | -- Allow only polygons (there are a few point buildings in there, don't ask me why) 14 | ST_GeometryType(gebouw.wkb_geometry) = 'ST_Polygon' AND 15 | -- Expand each source geometry with a buffer of a few meters to include non-intersecting target geometries 16 | st_intersects(st_buffer(gebouw.wkb_geometry, 0.00005), osm_buildings.wkb_geometry) AND 17 | -- Guarantee good geometries 18 | st_issimple(st_snaptogrid(gebouw.wkb_geometry, 0.000001)) AND 19 | st_issimple(st_snaptogrid(osm_buildings.wkb_geometry, 0.000001)) AND 20 | st_issimple(st_snaptogrid(st_intersection(gebouw.wkb_geometry, osm_buildings.wkb_geometry), 0.000001)) AND 21 | -- Restrict to ringless polygons 22 | ST_NumInteriorRings(gebouw.wkb_geometry) = 0 AND 23 | ST_NumInteriorRings(ST_GeometryN(osm_buildings.wkb_geometry, 1)) = 0 24 | LIMIT 500000 -------------------------------------------------------------------------------- /prep/triangles.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from model.topoml_util.GeoVectorizer import GeoVectorizer 3 | from shapely.geometry import Polygon 4 | from shapely.wkt import loads 5 | 6 | SET_SIZE = 100000 7 | TRIANGLES = '../files/triangles.npz' 8 | 9 | 10 | print('Creating triangles') 11 | raw_training_vectors = np.random.normal(size=(SET_SIZE, 6, 2)) 12 | triangle_sets = np.array([[Polygon(point_set[0:3]).wkt, Polygon(point_set[3:]).wkt] 13 | for point_set in raw_training_vectors]) 14 | max_points = GeoVectorizer.max_points(triangle_sets[:, 0], triangle_sets[:, 1]) 15 | 16 | print('Intersecting triangles and pruning') 17 | intersection_area = [] 18 | intersection_vectors = [] 19 | for index, (a, b) in enumerate(triangle_sets): 20 | # if loads(a).intersection_surface_area(loads(b)).type == 'Polygon': # constrain to actually intersecting 21 | intersection = loads(a).intersection(loads(b)) 22 | intersection_area.append(intersection.area) 23 | intersection_vectors.append(GeoVectorizer.vectorize_wkt(intersection.wkt, 12)) 24 | 25 | training_vectors = np.reshape(raw_training_vectors, (SET_SIZE, 12)) 26 | (_, GEO_VECTOR_LEN) = np.array(training_vectors).shape 27 | intersection_area = np.array(intersection_area) 28 | 29 | print('Saving compressed numpy data file', TRIANGLES) 30 | 31 | np.savez_compressed( 32 | TRIANGLES, 33 | point_sequence=training_vectors, # Sets of two geometries in WGS84 lon/lat, 25% of them overlapping 34 | intersection_geoms=intersection_vectors, # Geometries representing the intersection_surface_area in WGS84 lon/lat 35 | intersection_surface=intersection_area, # Surface in square meters of the intersection_surface_area 36 | ) 37 | print('Saved vectorized geometries to', TRIANGLES) 38 | -------------------------------------------------------------------------------- /prep/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SPINLab/geometry-learning/5300d421ef848c2748a2ba41ced5c6e2fba93200/prep/util/__init__.py 
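An illustrative sketch (not part of the repository) of how the archive written by prep/triangles.py above could be inspected; the key names follow the np.savez_compressed call in that script, the relative path assumes the interpreter runs from the prep/ directory, and newer numpy versions may additionally need allow_pickle=True:

import numpy as np

data = np.load('../files/triangles.npz')
print(data['point_sequence'].shape)      # (100000, 12): two random triangles of three 2D points per record
print(len(data['intersection_geoms']))   # one vectorized intersection geometry per record
print(data['intersection_surface'][:5])  # intersection areas of the first few triangle pairs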
-------------------------------------------------------------------------------- /prep/util/layerToWGS.py: -------------------------------------------------------------------------------- 1 | from osgeo import osr, gdal 2 | 3 | 4 | def gdal_error_handler(err_class, err_num, err_msg): 5 | err_type = { 6 | gdal.CE_None: 'None', 7 | gdal.CE_Debug: 'Debug', 8 | gdal.CE_Warning: 'Warning', 9 | gdal.CE_Failure: 'Failure', 10 | gdal.CE_Fatal: 'Fatal' 11 | } 12 | err_msg = err_msg.replace('\n', ' ') 13 | err_class = err_type.get(err_class, 'None') 14 | print('Error Number: %s' % err_num) 15 | print('Error Type: %s' % err_class) 16 | print('Error Message: %s' % err_msg) 17 | 18 | 19 | # install error handler 20 | gdal.PushErrorHandler(gdal_error_handler) 21 | 22 | 23 | def layerToWGS(in_layer): 24 | out_driver = gdal.ogr.GetDriverByName('MEMORY') 25 | out_dataset = out_driver.CreateDataSource('Output datasource') 26 | out_layer = out_dataset.CreateLayer('Gebouw', geom_type=in_layer.GetGeomType()) 27 | 28 | # input SpatialReference 29 | in_spatial_ref = osr.SpatialReference() 30 | in_spatial_ref.ImportFromEPSG(28992) 31 | 32 | # output SpatialReference 33 | out_spatial_ref = osr.SpatialReference() 34 | out_spatial_ref.ImportFromEPSG(4326) 35 | 36 | # create the CoordinateTransformation 37 | coord_trans = osr.CoordinateTransformation(in_spatial_ref, out_spatial_ref) 38 | 39 | in_layer_defn = in_layer.GetLayerDefn() 40 | # get the output layer's feature definition 41 | out_layer_defn = out_layer.GetLayerDefn() 42 | 43 | for i in range(0, in_layer_defn.GetFieldCount()): 44 | field_defn = in_layer_defn.GetFieldDefn(i) 45 | out_layer.CreateField(field_defn) 46 | 47 | # loop through the input features 48 | in_feature = in_layer.GetNextFeature() 49 | while in_feature: 50 | # get the input geometry 51 | geometry = in_feature.GetGeometryRef() 52 | # reproject the geometry 53 | geometry.Transform(coord_trans) 54 | # create a new feature 55 | out_feature = in_feature.Clone() 56 | # set the geometry and attribute 57 | out_feature.SetGeometry(geometry) 58 | # out_feature.SetFieldsFrom(in_feature) 59 | # for i in range(0, out_layer_defn.GetFieldCount()): 60 | # out_feature.SetField(out_layer_defn.GetFieldDefn(i).GetNameRef(), in_feature.GetField(i)) 61 | # add the feature to the layer 62 | out_layer.CreateFeature(out_feature) 63 | # dereference the features and get the next input feature 64 | out_feature = None 65 | in_feature = in_layer.GetNextFeature() 66 | 67 | return out_layer 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | Keras>=2.1.2 3 | numpy>=1.14.0 4 | pandas>=0.22.0 5 | scikit-learn>=0.19.1 6 | scipy>=1.0.0 7 | Shapely>=1.6.3 8 | slackclient>=1.1.0 9 | tensorflow-gpu>=1.4.1 10 | matplotlib>=2.1.2 11 | pyefd>=1.0 12 | -------------------------------------------------------------------------------- /script/build-script.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | echo "Changes:" 4 | cat $1 5 | 6 | # TeamCity style 7 | CHANGED_MODEL_FILES="$(cat $1 | \ 8 | grep -v REMOVED | \ 9 | cut -d \: -f 1 | \ 10 | grep -e model | \ 11 | grep .py | \ 12 | grep -v util | \ 13 | grep -v baseline | \ 14 | grep -v png \ 15 | )" 16 | echo ${CHANGED_MODEL_FILES} 17 | 18 | # Comment out line below to enable automated script execution 19 | #CHANGED_MODEL_FILES="echo ${CHANGED_MODEL_FILES} | grep 
DISABLE_AUTOMATED_EXECUTION" 20 | 21 | set -e 22 | cd model 23 | for FILE in ${CHANGED_MODEL_FILES} 24 | do 25 | python3 ../${FILE} 26 | done 27 | 28 | echo "built!" -------------------------------------------------------------------------------- /script/install-docker-ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | sudo apt-get update 3 | sudo apt-get install -y \ 4 | apt-transport-https \ 5 | ca-certificates \ 6 | curl \ 7 | software-properties-common 8 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 9 | sudo add-apt-repository \ 10 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ 11 | $(lsb_release -cs) \ 12 | stable" 13 | sudo apt-get update 14 | sudo apt-get -y install docker-ce docker-compose 15 | sudo gpasswd -a ${USER} docker 16 | newgrp docker 17 | 18 | # nvidia-docker 19 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \ 20 | sudo apt-key add - 21 | distribution=$(. /etc/os-release;echo ${ID}${VERSION_ID}) 22 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \ 23 | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 24 | sudo apt-get update 25 | sudo apt-get install -y nvidia-docker2 26 | sudo tee /etc/docker/daemon.json < /etc/apt/sources.list.d/owncloud-client.list" 18 | sudo apt-get update 19 | sudo apt-get install owncloud-client 20 | 21 | # Geospatial dependencies 22 | sudo add-apt-repository ppa:ubuntugis/ppa 23 | sudo apt-get update 24 | sudo apt-get install -y python-numpy gdal-bin libgdal-dev 25 | pip3 install shapely rasterio 26 | sudo apt-get install -y libgeos-dev python3-tk # reinstall python3? 27 | 28 | # Machine learning dependencies 29 | sudo pip3 install --upgrade keras # check ~/.keras/keras.json for correct settings! 30 | # Install magenta requirement cuda 8.0 v6 for tf 1.2 - 1.4 31 | # From https://gitlab.com/nvidia/cuda/blob/c5e8c8d7a9fd444c4e45573f36cbeb8f4e10f71c/8.0/runtime/cudnn6/Dockerfile 32 | # And https://stackoverflow.com/questions/41991101/importerror-libcudnn-when-running-a-tensorflow-program 33 | 34 | # Updated drivers 35 | sudo add-apt-repository ppa:graphics-drivers/ppa 36 | sudo apt-get update 37 | 38 | #Install the recommended driver (currently nvidia-390) 39 | sudo ubuntu-drivers autoinstall 40 | 41 | # cuda toolkit, see also https://developer.nvidia.com/cuda-toolkit-archive 42 | wget -O cuda_8_linux.run https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run 43 | sudo chmod +x cuda_8_linux.run 44 | ech./cuda_8_linux.run 45 | #Do you accept the previously read EULA? 46 | #accept 47 | #Install NVIDIA Accelerated Graphics Driver for Linux-x86_64 367.48? 48 | #n (we installed drivers previously) 49 | #Install the CUDA 8.0 Toolkit? 50 | #y 51 | #Enter Toolkit Location: 52 | #/usr/local/cuda-8.0 (enter) 53 | #Do you wish to run the installation with ‚sudo’? 54 | #y 55 | #Do you want to install a symbolic link at /usr/local/cuda? 56 | #y 57 | #Install the CUDA 8.0 Samples? 
58 | #y 59 | #Enter CUDA Samples Location: 60 | #enter 61 | 62 | sudo apt-get install -y libcupti-dev 63 | 64 | # Install cudnn 65 | cd ~ 66 | wget http://developer.download.nvidia.com/compute/redist/cudnn/v6.0/cudnn-8.0-linux-x64-v6.0.tgz 67 | tar xvzf cudnn-8.0-linux-x64-v6.0.tgz 68 | sudo cp -P cuda/include/cudnn.h /usr/local/cuda-8.0/include 69 | sudo cp -P cuda/lib64/libcudnn* /usr/local/cuda-8.0/lib64/ 70 | sudo chmod a+r /usr/local/cuda-8.0/include/cudnn.h /usr/local/cuda-8.0/lib64/libcudnn* 71 | 72 | # set environment variables 73 | echo export PATH=/usr/local/cuda-8.0/bin${PATH:+:${PATH}} >> ~/.bashrc 74 | echo export LD_LIBRARY_PATH=/usr/local/cuda-8.0/lib64/:/usr/lib/nvidia-384${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} >> ~/.bashrc 75 | echo export CUDA_HOME=/usr/local/cuda-8.0 >> ~/.bashrc 76 | source ~/.bashrc 77 | 78 | # GUI and remote access 79 | sudo apt-get install -y lxde 80 | # sudo rm /home/ubuntu/.Xauthority 81 | sudo startlxde 82 | sudo add-apt-repository -y ppa:x2go/stable 83 | sudo apt-get update 84 | sudo apt-get install -y x2goserver x2goserver-xsession 85 | wget https://download.jetbrains.com/python/pycharm-community-2017.2.3.tar.gz 86 | tar xvzf pycharm-community-2017.2.3.tar.gz 87 | 88 | # time zone and numlock config 89 | sudo timedatectl set-timezone Europe/Amsterdam 90 | sudo apt-get install numlockx 91 | sudo sed -i 's|^exit 0.*$|# Numlock enable\n[ -x /usr/bin/numlockx ] \&\& numlockx on\n\nexit 0|' /etc/rc.local 92 | echo "/usr/bin/numlockx on" | sudo tee -a /etc/X11/xinit/xinitrc 93 | echo "JAVA_HOME=\"/usr/lib/jvm/java-8-openjdk-amd64\"" | sudo tee -a /etc/environment 94 | sudo reboot 95 | -------------------------------------------------------------------------------- /script/run-all-models.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | : "${SLACK_API_TOKEN:?You need to provide a SLACK_API_TOKEN environment parameter}" 4 | for script in ../model/*.py 5 | do 6 | python3 ${script} 7 | EC=$? 8 | if [ ${EC} -eq 0 ] 9 | then 10 | echo "${script} completed successfully." 11 | else 12 | echo "${script} failed, sending notification..."
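# send a short message to the Slack channel configured in script/slack_notify.py so failed runs do not go unnoticed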
13 | python3 ./slack_notify.py "Failed at executing ${script}" 14 | fi 15 | done -------------------------------------------------------------------------------- /script/slack_notify.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import sys 4 | from slackclient import SlackClient 5 | 6 | slack_token = os.environ.get("SLACK_API_TOKEN") 7 | 8 | 9 | if slack_token: 10 | sc = SlackClient(slack_token) 11 | sc.api_call( 12 | "chat.postMessage", 13 | channel="#machinelearning", 14 | text="Notification: " + ', '.join(sys.argv[1:])) 15 | else: 16 | print('No slack notification: no slack API token environment variable "SLACK_API_TOKEN" set.') 17 | -------------------------------------------------------------------------------- /script/test-tensorflow.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | hello = tf.constant('Hello, TensorFlow!') 3 | sess = tf.Session() 4 | print(sess.run(hello)) 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | setup(name='Topology learning', 6 | version='1.0', 7 | description='Machine learning experiments for geospatial vector geometries', 8 | author='Rein van \'t Veer', 9 | author_email='rein.van.t.veer@geodan.nl', 10 | url='https://github.com/reinvantveer/Topology-Learning', 11 | packages=['model', 'model.topoml_util', 'model.baseline'], 12 | license='MIT', 13 | install_requires=[ 14 | 'sklearn', 15 | 'slackclient', 16 | 'scipy', 17 | 'keras', 18 | 'numpy', 19 | 'shapely', 20 | 'tensorflow-gpu' 21 | ], 22 | ) 23 | --------------------------------------------------------------------------------
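A minimal sketch (not part of the repository) of loading the archives written by prep/preprocess-neighborhoods.py above; the file name and key names follow the TRAIN_DATA_FILE constant and the np.savez_compressed calls in that script, the path is relative to the prep/ directory, and allow_pickle=True is an assumption for numpy versions that default it to False:

import numpy as np

train = np.load('../files/neighborhoods/neighborhoods_train_v7.npz', allow_pickle=True)
print(train['fixed_size_geoms'].shape)              # fixed-size vectorized geometries, one per neighborhood
print(train['elliptic_fourier_descriptors'].shape)  # elliptic Fourier descriptors per neighborhood
print(train['above_or_below_median'][:10])          # 1 if the neighborhood's inhabitant count is above the median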