├── .gitignore
├── Chapter 1-2
│   ├── Computing measures of spatial dependence.ipynb
│   ├── Continuous Spatial Models.ipynb
│   ├── Discrete Spatial Models.ipynb
│   ├── Visualizing spatial data with CARTOframes.ipynb
│   ├── datasets.py
│   └── utils.py
├── Chapter 3
│   ├── agglomerative.ipynb
│   ├── datasets.py
│   ├── dbscan.ipynb
│   ├── skater.ipynb
│   └── utils.py
├── Chapter 4
│   ├── Travelling Salesman Problem.ipynb
│   ├── data
│   │   └── target_mn_loc.p
│   ├── datasets.py
│   └── utilis.py
├── Dockerfile
├── README.md
├── requirements.txt
└── src_import
    ├── modules.R
    └── utils.R
/.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | build 3 | dist 4 | _build 5 | docs/man/*.gz 6 | docs/source/api/generated 7 | docs/source/config.rst 8 | docs/gh-pages 9 | notebook/i18n/*/LC_MESSAGES/*.mo 10 | notebook/i18n/*/LC_MESSAGES/nbjs.json 11 | notebook/static/components 12 | notebook/static/style/*.min.css* 13 | notebook/static/*/js/built/ 14 | notebook/static/*/built/ 15 | notebook/static/built/ 16 | notebook/static/*/js/main.min.js* 17 | notebook/static/lab/*bundle.js 18 | node_modules 19 | *.py[co] 20 | __pycache__ 21 | *.egg-info 22 | *~ 23 | *.bak 24 | .ipynb_checkpoints 25 | .tox 26 | .DS_Store 27 | \#*# 28 | .#* 29 | .coverage 30 | .pytest_cache 31 | src 32 | 33 | *.swp 34 | *.map 35 | .idea/ 36 | Read the Docs 37 | config.rst 38 | *.iml 39 | /.project 40 | /.pydevproject 41 | 42 | package-lock.json 43 | geckodriver.log 44 | *.iml 45 | -------------------------------------------------------------------------------- /Chapter 1-2/datasets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import geopandas as gpd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import shapely 6 | from libpysal.weights import Queen 7 | import pointpats 8 | import pointpats.centrography 9 | 10 | from cartoframes.auth import set_default_credentials 11 | from cartoframes import read_carto 12 | from cartoframes import to_carto 13 | 14 | set_default_credentials('ebook-sds') 15 | 16 | ## The Meuse dataset from R gstat package 17 | class GetMeuse(): 18 | def __init__(self): 19 | self.data = read_carto('meuse') 20 | self.data['log_zinc'] = np.log(self.data['zinc']) 21 | self.data = self.data.to_crs({'init': 'epsg:28992'}) 22 | self.data_lonlat = self.data.to_crs({'init': 'epsg:4326'}) 23 | 24 | self.data_grid = read_carto('meuse_grid') 25 | self.data_grid = self.data_grid.to_crs({'init': 'epsg:28992'}) 26 | self.data_grid_lonlat = self.data_grid.to_crs({'init': 'epsg:4326'}) 27 | 28 | def loadpred_krg(self): 29 | 30 | self.data_krg = pd.read_csv('/tmp/meuse_krg.csv') 31 | self.data_krg = gpd.GeoDataFrame(self.data_krg, geometry=gpd.points_from_xy(self.data_krg.x, self.data_krg.y)) 32 | self.data_krg.crs = {'init': 'epsg:28992'} 33 | self.data_krg_lonlat = self.data_krg.to_crs({'init': 'epsg:4326'}) 34 | 35 | self.data_grid_krg = pd.read_csv('/tmp/meuse.grid_krg.csv') 36 | self.data_grid_krg = gpd.GeoDataFrame(self.data_grid_krg, geometry=gpd.points_from_xy(self.data_grid_krg.x, self.data_grid_krg.y)) 37 | self.data_grid_krg.crs = {'init': 'epsg:28992'} 38 | self.data_grid_krg_lonlat = self.data_grid_krg.to_crs({'init': 'epsg:4326'}) 39 | 40 | def loadpred_INLAspde(self): 41 | 42 | self.data_INLAspde = pd.read_csv('/tmp/meuse_INLAspde.csv') 43 | self.data_INLAspde = gpd.GeoDataFrame(self.data_INLAspde, geometry=gpd.points_from_xy(self.data_INLAspde.x, self.data_INLAspde.y)) 44 | self.data_INLAspde.crs = {'init': 'epsg:28992'} 45 | 
self.data_INLAspde_lonlat = self.data_INLAspde.to_crs({'init': 'epsg:4326'}) 46 | 47 | self.data_grid_INLAspde = pd.read_csv('/tmp/meuse.grid_INLAspde.csv') 48 | self.data_grid_INLAspde = gpd.GeoDataFrame(self.data_grid_INLAspde, geometry=gpd.points_from_xy(self.data_grid_INLAspde.x, self.data_grid_INLAspde.y)) 49 | self.data_grid_INLAspde.crs = {'init': 'epsg:28992'} 50 | self.data_grid_INLAspde_lonlat = self.data_grid_INLAspde.to_crs({'init': 'epsg:4326'}) 51 | 52 | ## The Boston dataset from R spData package 53 | class GetBostonHousing(): 54 | def __init__(self): 55 | self.data_carto = read_carto('boston_housing') 56 | ## Renaming the geometry column from 'the_geom' to 'geometry' 57 | ## (pysal expects the geometry column to be called 'geometry') 58 | self.data = self.data_carto.copy() 59 | self.data['geometry'] = self.data.geometry 60 | self.data.drop(['the_geom'],axis = 1, inplace = True) 61 | self.data = gpd.GeoDataFrame(self.data, geometry = 'geometry') 62 | self.w = Queen.from_dataframe(self.data) 63 | 64 | def loadpred(self): 65 | self.data_preds = gpd.read_file('/tmp/boston_housing_predictions.shp') 66 | self.data_preds.crs = {'init': 'epsg:4326'} 67 | 68 | ## The Crime dataset from UK Police data 69 | class GetCrimeLondon(): 70 | def __init__(self, var, var_value): 71 | self.filename = '/tmp/UK_Police_street_crimes_2019_04.csv' 72 | self.data = read_carto('uk_police_street_crimes_2019_04') 73 | self.data = self.data[self.data[var] == var_value] 74 | self.data_lonlat = self.data 75 | self.data_lonlat = read_carto(''' 76 | SELECT c.* 77 | FROM uk_police_street_crimes_2019_04 as c 78 | JOIN london_borough_excluding_mhw as g 79 | ON ST_Intersects(c.the_geom, g.the_geom) 80 | 81 | ''') 82 | self.data = self.data.to_crs({'init': 'epsg:32630'}) 83 | 84 | def pp(self): 85 | self.pointpattern = pointpats.PointPattern( 86 | pd.concat([self.data.geometry.x,self.data.geometry.y], axis=1) 87 | ) -------------------------------------------------------------------------------- /Chapter 1-2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import geopandas as gpd 5 | import shapely.geometry, shapely.affinity  # explicit submodule imports; needed by ell2gdf below 6 | 7 | def Variogram_plot(v, fig_title=None, axes=None, grid=True, show=False, hist=True): 8 | """Variogram Plot 9 | Plot the experimental variogram, the fitted theoretical function and 10 | a histogram for the lag classes. The axes attribute can be used to 11 | pass a list of AxesSubplots or a single instance to the plot 12 | function. Then these Subplots will be used. If only a single instance 13 | is passed, the hist attribute will be ignored as only the variogram 14 | will be plotted anyway. 15 | Parameters 16 | ---------- 17 | axes : list, tuple, array, AxesSubplot or None 18 | If None, the plot function will create a new matplotlib figure. 19 | Otherwise a single instance or a list of AxesSubplots can be 20 | passed to be used. If a single instance is passed, the hist 21 | attribute will be ignored. 22 | grid : bool 23 | Defaults to True. If True, a custom grid will be drawn through 24 | the lag class centers 25 | show : bool 26 | Defaults to False. If True, the show method of the passed or 27 | created matplotlib Figure will be called before returning the 28 | Figure. This should be set to False when used in a notebook, 29 | as the returned Figure object will be plotted anyway. 30 | hist : bool 31 | Defaults to True. If False, the creation of a histogram for the 32 | lag classes will be suppressed. 33 | Returns 34 | ------- 35 | matplotlib.Figure 36 | """ 37 | # get the parameters 38 | _bins = v.bins 39 | _exp = v.experimental 40 | x = np.linspace(0, np.nanmax(_bins), 100) # make the 100 a param? 41 | 42 | # do the plotting 43 | if axes is None: 44 | if hist: 45 | fig = plt.figure(figsize=(8, 5)) 46 | ax1 = plt.subplot2grid((5, 1), (1, 0), rowspan=4) 47 | ax2 = plt.subplot2grid((5, 1), (0, 0), sharex=ax1) 48 | fig.subplots_adjust(hspace=0) 49 | else: 50 | fig, ax1 = plt.subplots(1, 1, figsize=(8, 4)) 51 | ax2 = None 52 | elif isinstance(axes, (list, tuple, np.ndarray)): 53 | ax1, ax2 = axes 54 | fig = ax1.get_figure() 55 | else: 56 | ax1 = axes 57 | ax2 = None 58 | fig = ax1.get_figure() 59 | 60 | # apply the model 61 | y = v.transform(x) 62 | 63 | # handle the relative experimental variogram 64 | if v.normalized: 65 | _bins /= np.nanmax(_bins) 66 | y /= np.max(_exp) 67 | _exp /= np.nanmax(_exp) 68 | x /= np.nanmax(x) 69 | 70 | # ------------------------ 71 | # plot Variograms 72 | ax1.plot(_bins, _exp, marker=".", color='orange', markersize=15, linestyle='None') 73 | ax1.plot(x, y, 'blue', linewidth=2) 74 | ax1.set_facecolor('white') 75 | 76 | # ax limits 77 | if v.normalized: 78 | ax1.set_xlim([0, 1.05]) 79 | ax1.set_ylim([0, 1.05]) 80 | if grid: 81 | ax1.grid(False) 82 | ax1.vlines(_bins, *ax1.axes.get_ybound(), colors=(.85, .85, .85), 83 | linestyles='dashed',linewidth=0.5) 84 | # annotation 85 | ax1.axes.set_ylabel('semivariance (%s)' % v._estimator.__name__) 86 | ax1.axes.set_xlabel('Lag (-)') 87 | 88 | # ------------------------ 89 | # plot histogram 90 | if ax2 is not None and hist: 91 | # calc the histogram 92 | _count = np.fromiter( 93 | (g.size for g in v.lag_classes()), dtype=int 94 | ) 95 | 96 | # set the sum of hist bar widths to 70% of the x-axis space 97 | w = (np.max(_bins) * 0.7) / len(_count) 98 | 99 | # plot 100 | ax2.bar(_bins, _count, width=w, align='center', color='blue') 101 | 102 | # adjust 103 | plt.setp(ax2.axes.get_xticklabels(), visible=False) 104 | ax2.axes.set_yticks(ax2.axes.get_yticks()[1:]) 105 | 106 | # need a grid? 107 | if grid: 108 | ax2.grid(False) 109 | ax2.vlines(_bins, *ax2.axes.get_ybound(), 110 | colors=(.85, .85, .85), linestyles='dashed',linewidth=0.5) 111 | 112 | # annotate 113 | ax2.axes.set_ylabel('N') 114 | ax2.set_facecolor('white') 115 | 116 | plt.title(fig_title) 117 | return fig 118 | 119 | def geom2gdf(geom, crs, lonlat = True): 120 | geom = [['geom', geom]] 121 | geom = pd.DataFrame(geom, columns = ['geom', 'geometry']) 122 | geom = gpd.GeoDataFrame(geom, geometry=geom.geometry) 123 | geom.crs = {'init': crs} 124 | if(lonlat): 125 | geom = geom.to_crs({'init': 'epsg:4326'}) 126 | 127 | return geom 128 | 129 | def ell2gdf(M, sMx, sMy, theta, crs): 130 | circ = shapely.geometry.Point(M).buffer(1) 131 | ell = shapely.affinity.scale(circ, int(sMx), int(sMy)) 132 | ellr = shapely.affinity.rotate(ell,-np.degrees(theta)) 133 | poly = geom2gdf(ellr, crs) 134 | 135 | return poly 136 | 137 | def ncdump(nc_fid, verb=True): 138 | ''' 139 | ncdump outputs dimensions, variables and their attribute information. 140 | The information is similar to that of NCAR's ncdump utility. 141 | ncdump requires a valid instance of Dataset. 142 | Parameters 143 | ---------- 144 | nc_fid : netCDF4.Dataset 145 | A netCDF4 dataset object 146 | verb : Boolean 147 | whether or not nc_attrs, nc_dims, and nc_vars are printed 148 | Returns 149 | ------- 150 | nc_attrs : list 151 | A Python list of the NetCDF file global attributes 152 | nc_dims : list 153 | A Python list of the NetCDF file dimensions 154 | nc_vars : list 155 | A Python list of the NetCDF file variables 156 | ''' 157 | def print_ncattr(key): 158 | """ 159 | Prints the NetCDF file attributes for a given key 160 | Parameters 161 | ---------- 162 | key : unicode 163 | a valid netCDF4.Dataset.variables key 164 | """ 165 | try: 166 | print("\t\ttype:", repr(nc_fid.variables[key].dtype)) 167 | for ncattr in nc_fid.variables[key].ncattrs(): 168 | print('\t\t%s:' % ncattr,\ 169 | repr(nc_fid.variables[key].getncattr(ncattr))) 170 | except KeyError: 171 | print("\t\tWARNING: %s does not contain variable attributes" % key) 172 | # NetCDF global attributes 173 | nc_attrs = nc_fid.ncattrs() 174 | if verb: 175 | print("NetCDF Global Attributes:") 176 | for nc_attr in nc_attrs: 177 | print('\t%s:' % nc_attr, repr(nc_fid.getncattr(nc_attr))) 178 | nc_dims = [dim for dim in nc_fid.dimensions] # list of nc dimensions 179 | # Dimension shape information. 180 | if verb: 181 | print("NetCDF dimension information:") 182 | for dim in nc_dims: 183 | print("\tName:", dim) 184 | print("\t\tsize:", len(nc_fid.dimensions[dim])) 185 | print_ncattr(dim) 186 | # Variable information. 187 | nc_vars = [var for var in nc_fid.variables] # list of nc variables 188 | if verb: 189 | print("NetCDF variable information:") 190 | for var in nc_vars: 191 | if var not in nc_dims: 192 | print('\tName:', var) 193 | print("\t\tdimensions:", nc_fid.variables[var].dimensions) 194 | print("\t\tsize:", nc_fid.variables[var].size) 195 | print_ncattr(var) 196 | return nc_attrs, nc_dims, nc_vars -------------------------------------------------------------------------------- /Chapter 3/datasets.py: -------------------------------------------------------------------------------- 1 | import geopandas as gpd 2 | from cartoframes.auth import set_default_credentials 3 | from cartoframes import read_carto 4 | from cartoframes import to_carto 5 | 6 | set_default_credentials("ebook-sds") 7 | 8 | 9 | def get_table(tablename): 10 | """Retrieve tablename as a GeoDataFrame ordered by database id 11 | 12 | Returns: 13 | geopandas.GeoDataFrame: GeoDataFrame representation of table 14 | """ 15 | base_query = ("SELECT * FROM {tablename} ORDER BY cartodb_id ASC").format( 16 | tablename=tablename 17 | ) 18 | data_carto = read_carto(base_query) 19 | ## Renaming the geometry column from 'the_geom' to 'geometry' 20 | ## (pysal expects the geometry column to be called 'geometry') 21 | data = data_carto.copy() 22 | data['geometry'] = data.geometry 23 | data.drop(['the_geom'],axis = 1, inplace = True) 24 | data = gpd.GeoDataFrame(data, geometry = 'geometry') 25 | data.crs = {"init": "epsg:4326"} 26 | 27 | return data 28 | 29 | 30 | def get_nyc_census_tracts(): 31 | """Retrieve dataset on NYC Census Tracts 32 | 33 | Returns: 34 | geopandas.GeoDataFrame: GeoDataFrame representation of table 35 | """ 36 | return get_table("census_tracts_cleaned") 37 | 38 | 39 | def get_safegraph_visits(): 40 | """Retrieve Safegraph visit data for Panama City Beach in July 2019 41 | as a GeoDataFrame ordered by database id 42 | 43 | Returns: 44 | geopandas.GeoDataFrame: GeoDataFrame representation of table 45 | """ 46 | return get_table("safegraph_pcb_visits") 47 | 
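These helpers are what the Chapter 3 notebooks import as `datasets`. A minimal usage sketch follows (hypothetical, not a file in this repo); it assumes network access to the public `ebook-sds` CARTO account set above, and the `num_visits` column is the one previewed in `dbscan.ipynb` below:

```python
# Minimal usage sketch of the Chapter 3 dataset helpers (assumed workflow).
import datasets

visits = datasets.get_safegraph_visits()    # point GeoDataFrame, ordered by cartodb_id
tracts = datasets.get_nyc_census_tracts()   # polygon GeoDataFrame of NYC census tracts

print(visits.crs)                           # {'init': 'epsg:4326'}
print(visits['num_visits'].describe())      # 'num_visits' column as previewed in dbscan.ipynb
```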
-------------------------------------------------------------------------------- /Chapter 3/dbscan.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# DBSCAN with Visit Data\n", 8 | "\n", 9 | "For this exercise, we will be working with a sample of [Safegraph's Patterns dataset](https://blog.safegraph.com/introducing-places-patterns-17ac5b96fb33).\n", 10 | "\n", 11 | "The data is a set of home locations from which people travel to visit Panama City Beach, Florida during the month of July 2019. This example is a basic reproduction of some of the findings in the [CARTO <> Safegraph partnership blog post](https://carto.com/blog/visit-pattern-footfall-data-safegraph/). The data comes from the `visitor_home_cbgs` home attribute for all Points of Interest (POIs) in Panama City Beach, Florida. See the [Patterns documentation](https://docs.safegraph.com/docs/places-schema#section-patterns) for more information.\n", 12 | "\n", 13 | "Since we know the locations that people are coming from, it might be natural to ask if there are general regions that we can identify as drivers of the visits. For example, are there areas with a higher density of source visits that could be used to understand visit demographics?\n", 14 | "\n", 15 | "Let's get started by downloading the data and taking a look at it." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import geopandas as gpd\n", 25 | "import numpy as np\n", 26 | "\n", 27 | "from cartoframes.viz import Map, Layer\n", 28 | "\n", 29 | "import datasets\n", 30 | "import warnings\n", 31 | "\n", 32 | "warnings.filterwarnings(\"ignore\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Retrieve the data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | "
cartodb_idlongitudelatitudenum_visitsgeometry
01-84.69018233.9909245POINT (-84.69018 33.99092)
12-85.87721230.21667913POINT (-85.87721 30.21668)
23-85.17326331.9042746POINT (-85.17326 31.90427)
34-86.00685234.6326415POINT (-86.00685 34.63264)
45-85.03878332.52374111POINT (-85.03878 32.52374)
\n", 118 | "
" 119 | ], 120 | "text/plain": [ 121 | " cartodb_id longitude latitude num_visits geometry\n", 122 | "0 1 -84.690182 33.990924 5 POINT (-84.69018 33.99092)\n", 123 | "1 2 -85.877212 30.216679 13 POINT (-85.87721 30.21668)\n", 124 | "2 3 -85.173263 31.904274 6 POINT (-85.17326 31.90427)\n", 125 | "3 4 -86.006852 34.632641 5 POINT (-86.00685 34.63264)\n", 126 | "4 5 -85.038783 32.523741 11 POINT (-85.03878 32.52374)" 127 | ] 128 | }, 129 | "execution_count": 2, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "sg_pcb = datasets.get_safegraph_visits()\n", 136 | "sg_pcb.head()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "This is a point dataset associated with the number of visits." 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "### Visualize points on map" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 3, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "\n", 170 | "\n", 171 | "\n", 172 | " None\n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | "\n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | "\n", 190 | " \n", 191 | "\n", 192 | " \n", 193 | "\n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | "\n", 200 | " \n", 201 | " \n", 217 | " \n", 255 | " \n", 302 | " \n", 326 | " \n", 340 | "\n", 341 | "\n", 342 | "\n", 343 | " Static map image\n", 344 | " \n", 345 | " \n", 346 | "
\n", 347 | "
\n", 348 | "
\n", 349 | " \n", 350 | " \n", 351 | "
\n", 352 | "
\n", 353 | "
\n", 354 | "\n", 355 | " \n", 356 | "\n", 357 | "
\n", 358 | "

There is a \n", 359 | " from the CARTO VL library:

\n", 360 | "
\n", 361 | " :\n", 362 | "
\n", 363 | " \n", 364 | " \n", 365 | "
\n", 366 | "
\n", 367 | "\n", 368 | "
\n", 369 | " StackTrace\n", 370 | "
    \n", 371 | "
    \n", 372 | "
    \n", 373 | "\n", 374 | "\n", 375 | "\n", 1021 | "\n", 1048 | "\n", 1049 | "\">\n", 1050 | "\n", 1051 | "" 1052 | ], 1053 | "text/plain": [ 1054 | "" 1055 | ] 1056 | }, 1057 | "execution_count": 3, 1058 | "metadata": {}, 1059 | "output_type": "execute_result" 1060 | } 1061 | ], 1062 | "source": [ 1063 | "Layer(sg_pcb)" 1064 | ] 1065 | }, 1066 | { 1067 | "cell_type": "markdown", 1068 | "metadata": {}, 1069 | "source": [ 1070 | "### Calculate Clusters\n", 1071 | "\n", 1072 | "To calculate clusters, we will use DBSCAN because it works well for finding clusters based on density and works well with spatial measurements." 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": 4, 1078 | "metadata": {}, 1079 | "outputs": [ 1080 | { 1081 | "name": "stdout", 1082 | "output_type": "stream", 1083 | "text": [ 1084 | "Number of clusters: 9\n" 1085 | ] 1086 | } 1087 | ], 1088 | "source": [ 1089 | "from sklearn.cluster import dbscan\n", 1090 | "\n", 1091 | "# use lat/lng in radians as coordinates\n", 1092 | "coords = np.radians(sg_pcb[[\"latitude\", \"longitude\"]].values)\n", 1093 | "\n", 1094 | "# choose appropriate epsilon value\n", 1095 | "# here we use ~35 kilometers\n", 1096 | "kms_per_radian = 6371\n", 1097 | "epsilon = 35 / kms_per_radian\n", 1098 | "\n", 1099 | "# calculate clusters\n", 1100 | "# use haversine metric for calculating approximate distances on earth's surface (crow fly)\n", 1101 | "_, cluster_labels = dbscan(\n", 1102 | " coords, eps=epsilon, min_samples=4, algorithm=\"ball_tree\", metric=\"haversine\",\n", 1103 | ")\n", 1104 | "\n", 1105 | "print(\"Number of clusters: {}\".format(len(set(cluster_labels))))" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "markdown", 1110 | "metadata": {}, 1111 | "source": [ 1112 | "### Add cluster labels to data\n", 1113 | "\n", 1114 | "Now that we have uncovered some natural clusters, let's give them some appropriate labels.\n", 1115 | "\n", 1116 | "Looking at the map below, we can see there are a few clusters that we can easily identify (e.g., local Panama City Beach and the large area in northern Alabama and Georgia), while other clusters are smaller and less significant. The values of `-1` indicate 'noise' or not falling into a cluster." 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": 5, 1122 | "metadata": {}, 1123 | "outputs": [ 1124 | { 1125 | "data": { 1126 | "text/html": [ 1127 | "\n", 1136 | "\n", 1137 | "\n", 1138 | " None\n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | "\n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | "\n", 1156 | " \n", 1157 | "\n", 1158 | " \n", 1159 | "\n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | "\n", 1166 | " \n", 1167 | " \n", 1183 | " \n", 1221 | " \n", 1268 | " \n", 1292 | " \n", 1306 | "\n", 1307 | "\n", 1308 | "\n", 1309 | " Static map image\n", 1310 | " \n", 1311 | " \n", 1312 | "
    \n", 1313 | "
    \n", 1314 | "
    \n", 1315 | " \n", 1316 | " \n", 1317 | "
    \n", 1318 | "
    \n", 1319 | " \n", 1320 | "\n", 1321 | "
    \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | "
    \n", 1338 | "
    \n", 1339 | "
    \n", 1340 | " \n", 1341 | "
    \n", 1342 | "
    \n", 1343 | "
    \n", 1344 | "\n", 1345 | " \n", 1346 | "\n", 1347 | "
    \n", 1348 | "

    There is a \n", 1349 | " from the CARTO VL library:

    \n", 1350 | "
    \n", 1351 | " :\n", 1352 | "
    \n", 1353 | " \n", 1354 | " \n", 1355 | "
    \n", 1356 | "
    \n", 1357 | "\n", 1358 | "
    \n", 1359 | " StackTrace\n", 1360 | "
      \n", 1361 | "
      \n", 1362 | "
      \n", 1363 | "\n", 1364 | "\n", 1365 | "\n", 2011 | "\n", 2038 | "\n", 2039 | "\">\n", 2040 | "\n", 2041 | "" 2042 | ], 2043 | "text/plain": [ 2044 | "" 2045 | ] 2046 | }, 2047 | "execution_count": 5, 2048 | "metadata": {}, 2049 | "output_type": "execute_result" 2050 | } 2051 | ], 2052 | "source": [ 2053 | "from cartoframes.viz import color_category_style\n", 2054 | "\n", 2055 | "# convert labels to text for creating a category map\n", 2056 | "sg_pcb[\"dbscan_labels\"] = [str(s) for s in cluster_labels]\n", 2057 | "\n", 2058 | "# show distribution of labels\n", 2059 | "Layer(sg_pcb, color_category_style('dbscan_labels'))" 2060 | ] 2061 | }, 2062 | { 2063 | "cell_type": "markdown", 2064 | "metadata": {}, 2065 | "source": [ 2066 | "### Apply readable labels to clusters" 2067 | ] 2068 | }, 2069 | { 2070 | "cell_type": "code", 2071 | "execution_count": 6, 2072 | "metadata": {}, 2073 | "outputs": [], 2074 | "source": [ 2075 | "sg_pcb[\"dbscan_labels\"] = cluster_labels\n", 2076 | "\n", 2077 | "# identify points as within a cluster or not\n", 2078 | "def in_cluster(cluster_num):\n", 2079 | " if cluster_num == -1:\n", 2080 | " return \"Out of cluster\"\n", 2081 | " return \"In cluster\"\n", 2082 | "\n", 2083 | "\n", 2084 | "sg_pcb[\"in_cluster\"] = sg_pcb[\"dbscan_labels\"].apply(in_cluster)" 2085 | ] 2086 | }, 2087 | { 2088 | "cell_type": "markdown", 2089 | "metadata": {}, 2090 | "source": [ 2091 | "### Calculate Convex Hulls to show approximate cluster region\n", 2092 | "\n", 2093 | "To get approximate polygons to represent the regions, we can group the points by label and draw a convex hull. We also added a small buffer to improve the cartography." 2094 | ] 2095 | }, 2096 | { 2097 | "cell_type": "code", 2098 | "execution_count": 7, 2099 | "metadata": {}, 2100 | "outputs": [], 2101 | "source": [ 2102 | "# group clusters (excluding noise)\n", 2103 | "# union points within cluster\n", 2104 | "# create a convex hull and small buffer\n", 2105 | "cluster_hulls = (\n", 2106 | " sg_pcb[sg_pcb[\"dbscan_labels\"] != -1]\n", 2107 | " .groupby(\"dbscan_labels\")\n", 2108 | " .geometry.apply(lambda x: x.unary_union.convex_hull.buffer(0.05))\n", 2109 | " .reset_index()\n", 2110 | ")\n", 2111 | "\n", 2112 | "cluster_hulls = gpd.GeoDataFrame(cluster_hulls)\n", 2113 | "\n", 2114 | "# Give cluster labels more readable titles\n", 2115 | "cluster_title_mapping = {\n", 2116 | " -1: \"Outlier\",\n", 2117 | " 0: \"Northern Alabama and Georgia\",\n", 2118 | " 1: \"Panama City Beach (Locals)\",\n", 2119 | "}\n", 2120 | "cluster_title_mapping.update(\n", 2121 | " {k: \"Other smaller region\" for k in range(2, max(cluster_labels) + 1)}\n", 2122 | ")\n", 2123 | "\n", 2124 | "cluster_hulls[\"dbscan_labels_readable\"] = cluster_hulls[\"dbscan_labels\"].apply(\n", 2125 | " lambda x: cluster_title_mapping.get(x)\n", 2126 | ")" 2127 | ] 2128 | }, 2129 | { 2130 | "cell_type": "code", 2131 | "execution_count": 8, 2132 | "metadata": {}, 2133 | "outputs": [ 2134 | { 2135 | "data": { 2136 | "text/html": [ 2137 | "
      \n", 2138 | "\n", 2151 | "\n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | " \n", 2163 | " \n", 2164 | " \n", 2165 | " \n", 2166 | " \n", 2167 | " \n", 2168 | " \n", 2169 | " \n", 2170 | " \n", 2171 | " \n", 2172 | " \n", 2173 | " \n", 2174 | " \n", 2175 | " \n", 2176 | " \n", 2177 | " \n", 2178 | " \n", 2179 | " \n", 2180 | " \n", 2181 | " \n", 2182 | " \n", 2183 | " \n", 2184 | " \n", 2185 | " \n", 2186 | " \n", 2187 | " \n", 2188 | " \n", 2189 | " \n", 2190 | " \n", 2191 | " \n", 2192 | " \n", 2193 | " \n", 2194 | " \n", 2195 | " \n", 2196 | " \n", 2197 | " \n", 2198 | " \n", 2199 | " \n", 2200 | " \n", 2201 | " \n", 2202 | " \n", 2203 | " \n", 2204 | " \n", 2205 | " \n", 2206 | " \n", 2207 | " \n", 2208 | " \n", 2209 | " \n", 2210 | "
      dbscan_labelsgeometrydbscan_labels_readable
      00POLYGON ((-84.94719 32.21661, -84.95174 32.215...Northern Alabama and Georgia
      11POLYGON ((-85.69464 30.06520, -85.69932 30.063...Panama City Beach (Locals)
      22POLYGON ((-85.12766 31.76488, -85.12806 31.759...Other smaller region
      33POLYGON ((-83.43705 31.46919, -83.43920 31.464...Other smaller region
      44POLYGON ((-87.06329 36.04642, -87.06598 36.042...Other smaller region
      55POLYGON ((-85.91513 31.18973, -85.91990 31.189...Other smaller region
      66POLYGON ((-88.37893 34.01090, -88.38195 34.006...Other smaller region
      77POLYGON ((-86.39062 35.74135, -86.39316 35.737...Other smaller region
      \n", 2211 | "
      " 2212 | ], 2213 | "text/plain": [ 2214 | " dbscan_labels geometry \\\n", 2215 | "0 0 POLYGON ((-84.94719 32.21661, -84.95174 32.215... \n", 2216 | "1 1 POLYGON ((-85.69464 30.06520, -85.69932 30.063... \n", 2217 | "2 2 POLYGON ((-85.12766 31.76488, -85.12806 31.759... \n", 2218 | "3 3 POLYGON ((-83.43705 31.46919, -83.43920 31.464... \n", 2219 | "4 4 POLYGON ((-87.06329 36.04642, -87.06598 36.042... \n", 2220 | "5 5 POLYGON ((-85.91513 31.18973, -85.91990 31.189... \n", 2221 | "6 6 POLYGON ((-88.37893 34.01090, -88.38195 34.006... \n", 2222 | "7 7 POLYGON ((-86.39062 35.74135, -86.39316 35.737... \n", 2223 | "\n", 2224 | " dbscan_labels_readable \n", 2225 | "0 Northern Alabama and Georgia \n", 2226 | "1 Panama City Beach (Locals) \n", 2227 | "2 Other smaller region \n", 2228 | "3 Other smaller region \n", 2229 | "4 Other smaller region \n", 2230 | "5 Other smaller region \n", 2231 | "6 Other smaller region \n", 2232 | "7 Other smaller region " 2233 | ] 2234 | }, 2235 | "execution_count": 8, 2236 | "metadata": {}, 2237 | "output_type": "execute_result" 2238 | } 2239 | ], 2240 | "source": [ 2241 | "cluster_hulls" 2242 | ] 2243 | }, 2244 | { 2245 | "cell_type": "markdown", 2246 | "metadata": {}, 2247 | "source": [ 2248 | "### Visualize outputs" 2249 | ] 2250 | }, 2251 | { 2252 | "cell_type": "code", 2253 | "execution_count": 9, 2254 | "metadata": {}, 2255 | "outputs": [ 2256 | { 2257 | "data": { 2258 | "text/html": [ 2259 | "\n", 2268 | "\n", 2269 | "\n", 2270 | " None\n", 2271 | " \n", 2272 | " \n", 2273 | " \n", 2274 | " \n", 2275 | " \n", 2276 | " \n", 2277 | " \n", 2278 | " \n", 2279 | " \n", 2280 | "\n", 2281 | " \n", 2282 | " \n", 2283 | " \n", 2284 | " \n", 2285 | " \n", 2286 | " \n", 2287 | "\n", 2288 | " \n", 2289 | "\n", 2290 | " \n", 2291 | "\n", 2292 | " \n", 2293 | " \n", 2294 | " \n", 2295 | " \n", 2296 | " \n", 2297 | "\n", 2298 | " \n", 2299 | " \n", 2315 | " \n", 2353 | " \n", 2400 | " \n", 2424 | " \n", 2438 | "\n", 2439 | "\n", 2440 | "\n", 2441 | " Static map image\n", 2442 | " \n", 2443 | " \n", 2444 | " \n", 2445 | "\n", 2446 | "\n", 2471 | " \n", 2472 | "
      \n", 2473 | "
      \n", 2474 | "
      \n", 2475 | " \n", 2476 | " \n", 2477 | "
      \n", 2478 | "
      \n", 2479 | " \n", 2480 | "\n", 2481 | "
      \n", 2482 | " \n", 2483 | " \n", 2484 | " \n", 2485 | " \n", 2486 | " \n", 2487 | " \n", 2490 | " \n", 2491 | " \n", 2492 | " \n", 2493 | " \n", 2494 | " \n", 2495 | " \n", 2496 | " \n", 2497 | " \n", 2498 | " \n", 2499 | " \n", 2500 | " \n", 2501 | " \n", 2504 | " \n", 2505 | " \n", 2506 | " \n", 2507 | " \n", 2508 | " \n", 2509 | " \n", 2510 | " \n", 2511 | "
      \n", 2512 | "
      \n", 2513 | "
      \n", 2514 | " \n", 2515 | "
      \n", 2516 | "
      \n", 2517 | "
      \n", 2518 | "\n", 2519 | " \n", 2520 | "\n", 2521 | "
      \n", 2522 | "

      There is a \n", 2523 | " from the CARTO VL library:

      \n", 2524 | "
      \n", 2525 | " :\n", 2526 | "
      \n", 2527 | " \n", 2528 | " \n", 2529 | "
      \n", 2530 | "
      \n", 2531 | "\n", 2532 | "
      \n", 2533 | " StackTrace\n", 2534 | "
        \n", 2535 | "
        \n", 2536 | "
        \n", 2537 | "\n", 2538 | "\n", 2539 | "\n", 3185 | "\n", 3212 | "\n", 3213 | "\">\n", 3214 | "\n", 3215 | "" 3216 | ], 3217 | "text/plain": [ 3218 | "" 3219 | ] 3220 | }, 3221 | "execution_count": 9, 3222 | "metadata": {}, 3223 | "output_type": "execute_result" 3224 | } 3225 | ], 3226 | "source": [ 3227 | "from cartoframes.viz import color_category_legend, category_widget\n", 3228 | "\n", 3229 | "\n", 3230 | "Map(\n", 3231 | " [Layer(cluster_hulls,\n", 3232 | " style = color_category_style(\"dbscan_labels_readable\",\n", 3233 | " opacity=0.7,\n", 3234 | " palette=[\"#66C5CC\", \"#DCB0F2\", \"#F89C74\"],\n", 3235 | " stroke_color=\"transparent\"),\n", 3236 | " legends=color_category_legend(title=\"Visit Regions\"),\n", 3237 | " widgets=[category_widget('dbscan_labels_readable',\n", 3238 | " title='Cluster lables',\n", 3239 | " description='Select a category to filter')]\n", 3240 | " ),\n", 3241 | " Layer(sg_pcb,\n", 3242 | " style = color_category_style(\"in_cluster\",\n", 3243 | " palette=[\"#666\", \"deeppink\"],\n", 3244 | " opacity=0.5),\n", 3245 | " legends=color_category_legend(title=\"In Cluster\")\n", 3246 | " )]\n", 3247 | ")" 3248 | ] 3249 | } 3250 | ], 3251 | "metadata": { 3252 | "kernelspec": { 3253 | "display_name": "Python 3", 3254 | "language": "python", 3255 | "name": "python3" 3256 | }, 3257 | "language_info": { 3258 | "codemirror_mode": { 3259 | "name": "ipython", 3260 | "version": 3 3261 | }, 3262 | "file_extension": ".py", 3263 | "mimetype": "text/x-python", 3264 | "name": "python", 3265 | "nbconvert_exporter": "python", 3266 | "pygments_lexer": "ipython3", 3267 | "version": "3.7.3" 3268 | } 3269 | }, 3270 | "nbformat": 4, 3271 | "nbformat_minor": 4 3272 | } 3273 | -------------------------------------------------------------------------------- /Chapter 3/utils.py: -------------------------------------------------------------------------------- 1 | def get_nyc_bridge_connections(): 2 | return { 3 | # Verrazzano-Narrows Bridge (Staten Island <-> Brooklyn) 4 | 4551: [2571], 5 | 2571: [4551], 6 | # Williamsburg Bridge (Manhattan <-> Brooklyn) 7 | 2923: [3498], 8 | 3498: [2923], 9 | # Queensborough Bridge (Manhattan <-> Queens) 10 | 3595: [3900], 11 | 3900: [3736, 3595], 12 | # Roosevelt Island <-> Queens 13 | 3736: [3900], 14 | # Astoria <-> East Harlem 15 | 3943: [3688], 16 | 3688: [2028, 3943], 17 | # East Harlem <-> Bronx 18 | 2028: [3688], 19 | # Northern Manhattan <-> Marble Hill 20 | 3769: [2237], 21 | 2237: [3769], 22 | # Rockaways 23 | 4411: [3009], 24 | 3009: [4411], 25 | } 26 | -------------------------------------------------------------------------------- /Chapter 4/data/target_mn_loc.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CartoDB/data-science-book/c0b6813117709c43dca6a9330c8380e52fba0d5e/Chapter 4/data/target_mn_loc.p -------------------------------------------------------------------------------- /Chapter 4/datasets.py: -------------------------------------------------------------------------------- 1 | import geopandas as gpd 2 | from cartoframes.auth import set_default_credentials 3 | from cartoframes import read_carto 4 | from cartoframes import to_carto 5 | 6 | set_default_credentials('ebook-sds') 7 | 8 | def get_retail_store_minnesota(): 9 | """Retrieve Retail Store Locations in Minnesota 10 | 11 | Returns: 12 | geopandas.GeoDataFrame: GeoDataFrame representation of table 13 | """ 14 | table_name = 'retail_store_minnesota' 15 | data = read_carto(table_name) 16 | 
data['store_id'] = data['store_id'].apply(lambda x: str(int(x))) 17 | data.crs = {'init': 'epsg:4326'} 18 | return data -------------------------------------------------------------------------------- /Chapter 4/utilis.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s') 3 | 4 | import operator 5 | import random 6 | from copy import deepcopy 7 | from itertools import combinations 8 | import pickle as pkl 9 | 10 | import numpy as np 11 | import math 12 | import pandas as pd 13 | import geopandas as gpd 14 | from shapely.geometry import * 15 | from geopy.distance import great_circle 16 | from simanneal import Annealer 17 | 18 | import cartoframes 19 | from cartoframes.viz import * 20 | from cartoframes.data import * 21 | 22 | import seaborn as sns 23 | from matplotlib import pyplot as plt 24 | plt.style.use('ggplot') 25 | 26 | 27 | ####################################################### 28 | # Solve TSP using Ant Colony Optimization in Python 3 # 29 | # Code Source: # 30 | # https://github.com/ppoffice/ant-colony-tsp # 31 | ####################################################### 32 | class Graph(object): 33 | def __init__(self, cost_matrix: list, rank: int): 34 | """ 35 | :param cost_matrix: 36 | :param rank: rank of the cost matrix 37 | """ 38 | self.matrix = cost_matrix 39 | self.rank = rank 40 | self.pheromone = [[1 / (rank * rank) for j in range(rank)] for i in range(rank)] 41 | logging.info(f'[Done] Load the Graph') 42 | 43 | 44 | class ACO(object): 45 | def __init__(self, ant_count: int, generations: int, alpha: float, beta: float, rho: float, q: int, 46 | strategy: int): 47 | """ 48 | :param ant_count: 49 | :param generations: 50 | :param alpha: relative importance of pheromone 51 | :param beta: relative importance of heuristic information 52 | :param rho: pheromone residual coefficient 53 | :param q: pheromone intensity 54 | :param strategy: pheromone update strategy. 
0 - ant-cycle, 1 - ant-quality, 2 - ant-density 55 | """ 56 | self.Q = q 57 | self.rho = rho 58 | self.beta = beta 59 | self.alpha = alpha 60 | self.ant_count = ant_count 61 | self.generations = generations 62 | self.update_strategy = strategy 63 | 64 | def _update_pheromone(self, graph: Graph, ants: list): 65 | for i, row in enumerate(graph.pheromone): 66 | for j, col in enumerate(row): 67 | graph.pheromone[i][j] *= self.rho 68 | for ant in ants: 69 | graph.pheromone[i][j] += ant.pheromone_delta[i][j] 70 | 71 | def solve(self, graph: Graph): 72 | """ 73 | :param graph: 74 | """ 75 | best_cost = float('inf') 76 | best_solution = [] 77 | all_costs = [] 78 | for gen in range(self.generations): 79 | ants = [_Ant(self, graph) for i in range(self.ant_count)] 80 | for ind, ant in enumerate(ants): 81 | for i in range(graph.rank - 1): 82 | ant._select_next() 83 | ant.total_cost += graph.matrix[ant.tabu[-1]][ant.tabu[0]] 84 | if ant.total_cost < best_cost: 85 | best_cost = ant.total_cost 86 | best_solution = [] + ant.tabu 87 | # update pheromone 88 | ant._update_pheromone_delta() 89 | self._update_pheromone(graph, ants) 90 | logging.info(f'[Generation #{gen}] [best cost: {best_cost}]') 91 | return best_solution, best_cost 92 | 93 | class _Ant(object): 94 | def __init__(self, aco: ACO, graph: Graph): 95 | self.colony = aco 96 | self.graph = graph 97 | self.total_cost = 0.0 98 | self.tabu = [] # tabu list 99 | self.pheromone_delta = [] # the local increase of pheromone 100 | self.allowed = [i for i in range(graph.rank)] # nodes which are allowed for the next selection 101 | self.eta = [[0 if i == j else 1 / graph.matrix[i][j] for j in range(graph.rank)] for i in 102 | range(graph.rank)] # heuristic information 103 | start = random.randint(0, graph.rank - 1) # start from any node 104 | self.tabu.append(start) 105 | self.current = start 106 | self.allowed.remove(start) 107 | 108 | def _select_next(self): 109 | denominator = 0 110 | for i in self.allowed: 111 | denominator += self.graph.pheromone[self.current][i] ** self.colony.alpha * self.eta[self.current][i] ** self.colony.beta 112 | probabilities = [0 for i in range(self.graph.rank)] # probabilities for moving to a node in the next step 113 | for i in range(self.graph.rank): 114 | try: 115 | self.allowed.index(i) # test if allowed list contains i 116 | probabilities[i] = self.graph.pheromone[self.current][i] ** self.colony.alpha * \ 117 | self.eta[self.current][i] ** self.colony.beta / denominator 118 | except ValueError: 119 | pass # do nothing 120 | # select next node by probability roulette 121 | selected = 0 122 | rand = random.random() 123 | for i, probability in enumerate(probabilities): 124 | rand -= probability 125 | if rand <= 0: 126 | selected = i 127 | break 128 | self.allowed.remove(selected) 129 | self.tabu.append(selected) 130 | self.total_cost += self.graph.matrix[self.current][selected] 131 | self.current = selected 132 | 133 | def _update_pheromone_delta(self): 134 | self.pheromone_delta = [[0 for j in range(self.graph.rank)] for i in range(self.graph.rank)] 135 | for _ in range(1, len(self.tabu)): 136 | i = self.tabu[_ - 1] 137 | j = self.tabu[_] 138 | if self.colony.update_strategy == 1: # ant-quality system 139 | self.pheromone_delta[i][j] = self.colony.Q 140 | elif self.colony.update_strategy == 2: # ant-density system 141 | self.pheromone_delta[i][j] = self.colony.Q / self.graph.matrix[i][j] 142 | else: # ant-cycle system 143 | self.pheromone_delta[i][j] = self.colony.Q / self.total_cost 144 | 145 | def distance_aco(cities: 
dict): 146 | distance_matrix = [] 147 | for ka, va in cities.items(): 148 | record_distance = [] 149 | for kb, vb in cities.items(): 150 | if kb == ka: 151 | record_distance.append(0.0) 152 | else: 153 | record_distance.append(great_circle(va, vb).m) 154 | distance_matrix.append(record_distance) 155 | logging.info(f'[Done] Create A Distance Matrix For ACO') 156 | return distance_matrix 157 | 158 | 159 | 160 | def location(data: pd.DataFrame, 161 | id_col: str, 162 | geometry_col: str) -> dict: 163 | """ 164 | Extract locations from a dataframe. Outputs a dict as {id: (lng, lat), ...} 165 | """ 166 | loc = {} 167 | for row in data.iterrows(): 168 | loc_id = row[1][id_col] 169 | x, y = row[1][geometry_col].x, row[1][geometry_col].y 170 | loc[loc_id] = loc.get(loc_id, (x,y)) 171 | logging.info(f'[Done] Transform DataFrame To Location Dict') 172 | return loc 173 | 174 | 175 | ####################################################### 176 | # Christofides algorithm # 177 | # Code Source: # 178 | # https://github.com/Retsediv/ChristofidesAlgorithm # 179 | ####################################################### 180 | 181 | def christofides(data): 182 | # build a graph 183 | G = build_graph(data) 184 | # print("Graph: ", G) 185 | 186 | # build a minimum spanning tree 187 | MSTree = minimum_spanning_tree(G) 188 | MSTree_init = deepcopy(MSTree) 189 | # print("MSTree: ", MSTree) 190 | 191 | # find odd vertexes 192 | odd_vertexes = find_odd_vertexes(MSTree) 193 | odd_vertexes_init = deepcopy(odd_vertexes) 194 | # print("Odd vertexes in MSTree: ", odd_vertexes) 195 | 196 | # add minimum weight matching edges to MST 197 | new_added_matching = minimum_weight_matching(MSTree, G, odd_vertexes) 198 | united_MSTree_perfect_matching = deepcopy(MSTree) 199 | # print("Minimum weight matching: ", MSTree) 200 | 201 | # find an eulerian tour 202 | eulerian_tour = find_eulerian_tour(MSTree, G) 203 | 204 | # print("Eulerian tour: ", eulerian_tour) 205 | 206 | current = eulerian_tour[0] 207 | path = [current] 208 | visited = [False] * len(eulerian_tour) 209 | 210 | length = 0 211 | 212 | for v in eulerian_tour[1:]: 213 | if not visited[v]: 214 | path.append(v) 215 | visited[v] = True 216 | 217 | length += G[current][v] 218 | current = v 219 | 220 | # path.append(path[0]) 221 | 222 | # print("Result path: ", path) 223 | # print("Result length of the path: ", length) 224 | 225 | return G, MSTree_init, odd_vertexes_init, new_added_matching, united_MSTree_perfect_matching, eulerian_tour, length, path 226 | 227 | 228 | 229 | def get_length(x1, y1, x2, y2, name='great_circle'): 230 | ''' 231 | x1: lat1 232 | y1: lng1 233 | x2: lat2 234 | y2: lng2 235 | ''' 236 | if name == 'great_circle': 237 | return great_circle((x1,y1), (x2,y2)).km 238 | else: 239 | return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** (1 / 2) 240 | 241 | 242 | def build_graph(data): 243 | graph = {} 244 | for this in range(len(data)): 245 | for another_point in range(len(data)): 246 | if this != another_point: 247 | if this not in graph: 248 | graph[this] = {} 249 | 250 | graph[this][another_point] = get_length(data[this][0], 251 | data[this][1], 252 | data[another_point][0], 253 | data[another_point][1], 254 | name='great_circle') 255 | 256 | return graph 257 | 258 | 259 | class UnionFind: 260 | def __init__(self): 261 | self.weights = {} 262 | self.parents = {} 263 | 264 | def __getitem__(self, object): 265 | if object not in self.parents: 266 | self.parents[object] = object 267 | self.weights[object] = 1 268 | return object 269 | 270 | # find path of objects 
leading to the root 271 | path = [object] 272 | root = self.parents[object] 273 | while root != path[-1]: 274 | path.append(root) 275 | root = self.parents[root] 276 | 277 | # compress the path and return 278 | for ancestor in path: 279 | self.parents[ancestor] = root 280 | return root 281 | 282 | def __iter__(self): 283 | return iter(self.parents) 284 | 285 | def union(self, *objects): 286 | roots = [self[x] for x in objects] 287 | heaviest = max([(self.weights[r], r) for r in roots])[1] 288 | for r in roots: 289 | if r != heaviest: 290 | self.weights[heaviest] += self.weights[r] 291 | self.parents[r] = heaviest 292 | 293 | 294 | def minimum_spanning_tree(G): 295 | tree = [] 296 | subtrees = UnionFind() 297 | for W, u, v in sorted((G[u][v], u, v) for u in G for v in G[u]): 298 | if subtrees[u] != subtrees[v]: 299 | tree.append((u, v, W)) 300 | subtrees.union(u, v) 301 | 302 | return tree 303 | 304 | 305 | def find_odd_vertexes(MST): 306 | tmp_g = {} 307 | vertexes = [] 308 | for edge in MST: 309 | if edge[0] not in tmp_g: 310 | tmp_g[edge[0]] = 0 311 | 312 | if edge[1] not in tmp_g: 313 | tmp_g[edge[1]] = 0 314 | 315 | tmp_g[edge[0]] += 1 316 | tmp_g[edge[1]] += 1 317 | 318 | for vertex in tmp_g: 319 | if tmp_g[vertex] % 2 == 1: 320 | vertexes.append(vertex) 321 | 322 | return vertexes 323 | 324 | 325 | def minimum_weight_matching(MST, G, odd_vert): 326 | import random 327 | random.shuffle(odd_vert) 328 | 329 | new_added = [] 330 | while odd_vert: 331 | v = odd_vert.pop() 332 | length = float("inf") 333 | u = 1 334 | closest = 0 335 | for u in odd_vert: 336 | if v != u and G[v][u] < length: 337 | length = G[v][u] 338 | closest = u 339 | 340 | MST.append((v, closest, length)) 341 | new_added.append((v, closest, length)) 342 | odd_vert.remove(closest) 343 | return new_added 344 | 345 | def find_eulerian_tour(MatchedMSTree, G): 346 | # find neighbours 347 | neighbours = {} 348 | for edge in MatchedMSTree: 349 | if edge[0] not in neighbours: 350 | neighbours[edge[0]] = [] 351 | 352 | if edge[1] not in neighbours: 353 | neighbours[edge[1]] = [] 354 | 355 | neighbours[edge[0]].append(edge[1]) 356 | neighbours[edge[1]].append(edge[0]) 357 | 358 | # print("Neighbours: ", neighbours) 359 | 360 | # finds the eulerian circuit by splicing sub-tours into EP 361 | start_vertex = MatchedMSTree[0][0] 362 | EP = [neighbours[start_vertex][0]] 363 | 364 | while len(MatchedMSTree) > 0: 365 | for i, v in enumerate(EP): 366 | if len(neighbours[v]) > 0: 367 | break 368 | 369 | while len(neighbours[v]) > 0: 370 | w = neighbours[v][0] 371 | 372 | remove_edge_from_matchedMST(MatchedMSTree, v, w) 373 | 374 | del neighbours[v][(neighbours[v].index(w))] 375 | del neighbours[w][(neighbours[w].index(v))] 376 | 377 | i += 1 378 | EP.insert(i, w) 379 | 380 | v = w 381 | 382 | return EP 383 | 384 | 385 | def remove_edge_from_matchedMST(MatchedMST, v1, v2): 386 | 387 | for i, item in enumerate(MatchedMST): 388 | if (item[0] == v2 and item[1] == v1) or (item[0] == v1 and item[1] == v2): 389 | del MatchedMST[i] 390 | 391 | return MatchedMST 392 | 393 | 394 | def Euler_Tour(multigraph): 395 | """ Uses Fleury's algorithm to find the Euler Tour of the MultiGraph. 
396 | """ 397 | tour = [] 398 | temp_graph = nx.MultiGraph() 399 | graph_nodes = nx.nodes(multigraph) 400 | current_node = graph_nodes[0] 401 | tour.append(current_node) 402 | while nx.number_of_edges(multigraph) > 0: 403 | for edge in multigraph.edges(current_node): 404 | temp_graph = copy.deepcopy(multigraph) 405 | temp_graph.remove_edge(edge[0], edge[1], key=None) 406 | if nx.is_connected(temp_graph): 407 | tour.append(edge[1]) 408 | current_node = edge[1] 409 | multigraph.remove_edge(edge[0], edge[1], key=None) 410 | break 411 | else: 412 | tour.append(edge[1]) 413 | current_node = edge[1] 414 | multigraph.remove_edge(edge[0], edge[1], key=None) 415 | multigraph.remove_nodes_from(nx.isolates(multigraph)) 416 | return tour 417 | 418 | 419 | def shortcut_Euler_Tour(tour): 420 | """Find's the shortcut of the Euler Tour to obtain the Approximation. 421 | """ 422 | Tour = [] 423 | for vertex in tour: 424 | if vertex not in Tour: 425 | Tour.append(vertex) 426 | Tour.append(tour[0]) 427 | return Tour 428 | 429 | 430 | class TravelingSalesman(Annealer): 431 | """Calculates sequence of places to visit""" 432 | def __init__(self, state, distance_matrix): 433 | self.distance_matrix = distance_matrix 434 | super(TravelingSalesman, self).__init__(state) 435 | 436 | def move(self): 437 | """Swaps two cities in the route.""" 438 | a = random.randint(0, len(self.state) - 1) 439 | b = random.randint(0, len(self.state) - 1) 440 | self.state[a], self.state[b] = self.state[b], self.state[a] 441 | 442 | def energy(self): 443 | """Calculates energy with current configuration""" 444 | total_dist = 0 445 | # add distances from i-1 -> i 446 | for i in range(len(self.state)): 447 | # loop, back to the start. 448 | total_dist += self.distance_matrix[self.state[i-1]][self.state[i]] 449 | return total_dist 450 | 451 | 452 | def TravelingSalesmanRun(loc: dict, iteration: int): 453 | output = pd.DataFrame({'id': [], 'iteration': [], 'distance': []}) 454 | 455 | # create a distance matrix 456 | distance_matrix = {} 457 | for ka, va in loc.items(): 458 | distance_matrix[ka] = {} 459 | for kb, vb in loc.items(): 460 | if kb == ka: 461 | distance_matrix[ka][kb] = 0.0 462 | else: 463 | distance_matrix[ka][kb] = great_circle(va, vb).m 464 | logging.info(f'[Done] Create A Distance Matrix)') 465 | 466 | for iter_i in range(iteration): 467 | # initial state 468 | 469 | # init_state = sorted(list(loc.keys())) 470 | init_state = list(loc.keys()) 471 | random.shuffle(init_state) 472 | 473 | # run 474 | distance_matrix_copy = distance_matrix.copy() 475 | 476 | tsp = TravelingSalesman(init_state, distance_matrix_copy) 477 | 478 | ################################################## 479 | # Tmax = 25000.0 # Max (starting) temperature # 480 | # Tmin = 2.5 # Min (ending) temperature # 481 | # steps = 50000 # Number of iterations # 482 | # updates = 100 # 483 | ################################################## 484 | auto_schedule = tsp.auto(minutes=1) 485 | tsp.set_schedule(auto_schedule) 486 | tsp.copy_strategy = "slice" 487 | 488 | state, e = tsp.anneal() 489 | 490 | logging.info(f'[{iter_i+1}]: {e} m route)') 491 | 492 | # record 493 | output_i = pd.DataFrame({'id': state, 'iteration': [iter_i]*len(loc), 'distance': [e]*len(loc)}) 494 | output = output.append(output_i) 495 | logging.info(f'[Done]: Traveling Salesman Run') 496 | return output 497 | 498 | def result(loc: dict, Output: pd.DataFrame) -> pd.DataFrame: 499 | output_copy = Output.copy() 500 | loc_copy = loc.copy() 501 | 502 | loc_copy = pd.DataFrame.from_dict(loc_copy, 
orient='index') 503 | loc_copy.columns = ['lng','lat'] 504 | 505 | output_copy['shortest'] = output_copy['distance'].rank(method="min") 506 | output_copy['visitOrder'] = output_copy.index + 1 507 | output_copy.index = output_copy['id'] 508 | output_copy = output_copy.join(loc_copy) 509 | output_copy['shortest'] = output_copy['shortest'].astype(int) 510 | output_copy = output_copy.sort_values(by=['shortest', 'visitOrder']) 511 | output_copy.reset_index(inplace=True, drop=True) 512 | output_copy['geometry'] = output_copy.apply(lambda x: Point(x.lng, x.lat), axis=1) 513 | logging.info(f'[Done]: Organize Result') 514 | return output_copy 515 | 516 | def shortestRoute(result: pd.DataFrame): 517 | shortest_route = result[result.shortest == 1] 518 | shortest_route = gpd.GeoDataFrame(shortest_route) 519 | logging.info(f'[Done]: Find The Shortest Route') 520 | return shortest_route 521 | 522 | 523 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/datascience-notebook 2 | 3 | USER root 4 | RUN apt-get update 5 | RUN apt-get install build-essential software-properties-common -y 6 | RUN apt-get install -y pkg-config 7 | RUN jupyter nbextension enable --py widgetsnbextension 8 | RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager 9 | RUN apt-get install -y gdal-bin 10 | RUN apt-get install -y libspatialindex-dev 11 | 12 | USER jovyan 13 | COPY . /tmp/ 14 | RUN pip install --requirement /tmp/requirements.txt 15 | RUN conda install -y r-rgdal 16 | RUN conda install -y r-spdep 17 | RUN Rscript /tmp/src_import/modules.R 18 | 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Becoming a Spatial Data Scientist Materials 2 | 3 | Example notebooks to accompany [Becoming a Spatial Data Scientist](https://go.carto.com/ebooks/spatial-data-science). 4 | 5 | ![](https://go.carto.com/hubfs/spatial-data-scientist-ebook-cover.png) 6 | 7 | 8 | 9 | ## Installation requirements 10 | 11 | The notebooks in this repository use a ready-to-run Docker image containing Jupyter applications and interactive computing tools. To run the notebooks, please follow the instructions below. 12 | 13 | 1. Clone this repository 14 | ```bash 15 | $ git clone git@github.com:CartoDB/data-science-book.git 16 | $ cd data-science-book 17 | ``` 18 | 19 | 2. Download and install Docker. Follow the instructions here: https://docs.docker.com/install/ 20 | 21 | 3. Run the image. Open your terminal and run 22 | ```bash 23 | $ docker run --user root -p 8888:8888 -e JUPYTER_ENABLE_LAB=yes -e GRANT_SUDO=yes -v "$PWD":/home/jovyan/workspace cartodb/data-science-book 24 | ``` 25 | 26 | A local address will be created. Copy and paste the address into your browser; this will launch Jupyter Lab. **Note**: If you have another Jupyter server running, make sure it's on a different port than 8888. Otherwise, change the port number above or shut down the other notebook server. 27 | 28 | 4. Start experimenting with the code in each of the Chapter directories 29 | 30 | 31 | 32 | ## Table of Contents 33 | 34 | ### Chapter 1 - 2 35 | 36 | - `Visualizing spatial data with CARTOframes` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%201-2/Visualizing%20spatial%20data%20with%20CARTOframes.ipynb)) - a notebook for easily visualizing your data on a map using CARTOframes. 37 | 38 | - `Computing measures of spatial dependence` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%201-2/Computing%20measures%20of%20spatial%20dependence.ipynb)) - a notebook for exploring spatial dependence in your data and visualizing the results using CARTOframes. 39 | 40 | - `Discrete spatial models` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%201-2/Discrete%20Spatial%20Models.ipynb)) - a notebook with examples of spatial models for discrete processes, visualizing the results using CARTOframes. 41 | 42 | - `Continuous spatial models` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%201-2/Continuous%20Spatial%20Models.ipynb)) - a notebook with examples of spatial models for continuous processes, visualizing the results using CARTOframes. 43 | 44 | ### Chapter 3 45 | 46 | - `Agglomerative Clustering` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%203/agglomerative.ipynb)) - a notebook demonstrating how to create spatially constrained clusters using agglomerative clustering 47 | - `DBSCAN` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%203/dbscan.ipynb)) - a notebook demonstrating how to create clusters of points in geographic coordinates 48 | - `SKATER` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%203/skater.ipynb)) - a notebook demonstrating how to create spatially constrained clusters that are homogeneous 49 | 50 | ### Chapter 4 51 | 52 | - `Travelling Salesman Problem` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%204/Travelling%20Salesman%20Problem.ipynb)) - a notebook demonstrating how to solve the travelling salesman problem.
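Once Jupyter Lab is up, a quick smoke test such as the following hypothetical cell (not part of the book's notebooks) confirms that the geospatial stack pinned in `requirements.txt` imports cleanly inside the container:

```python
# Hypothetical smoke test; package names are taken from requirements.txt.
import geopandas
import cartoframes
import libpysal
import esda
import pointpats

print('geopandas', geopandas.__version__)
print('cartoframes', cartoframes.__version__)
```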
53 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | geopandas 2 | cartoframes==v1.0.0 3 | shapely 4 | scikit-gstat 5 | scikit-learn 6 | libpysal 7 | esda 8 | pointpats 9 | splot 10 | rpy2 11 | rtree 12 | netCDF4 13 | git+https://github.com/pysal/region.git#egg=region 14 | geopy 15 | simanneal 16 | -------------------------------------------------------------------------------- /src_import/modules.R: -------------------------------------------------------------------------------- 1 | libraries <- c('devtools', 2 | 'sp', 3 | 'spdep', 4 | 'rgdal', 5 | 'raster', 6 | 'maptools', 7 | 'gstat', 8 | 'reshape2', 9 | 'magrittr', 10 | 'dplyr', 11 | 'mgcv', 12 | 'splancs', 13 | 'INLA') 14 | CheckInstallPackages <- function(pkgs){ 15 | 16 | #For each pkg in pkgs (attempt to load each package one at a time): 17 | 18 | x <- lapply(pkgs, function(pkg){ 19 | 20 | #Load the package if available, 21 | 22 | if(!do.call("require", list(pkg))) { 23 | #Silently attempt to install into the default library 24 | 25 | try(install.packages(pkg, lib=.Library,repos="http://cran.rstudio.com")) 26 | 27 | #Now attempt to load the package, catch error if it wasn't installed 28 | 29 | tryCatch(do.call("library", list(pkg)), 30 | 31 | #Catch if we're unable to install into the default library 32 | 33 | error = function(err) { 34 | 35 | #If non-interactive, install into this user's personal library 36 | 37 | if(!interactive()) { 38 | 39 | #Get the path to this user's personal library 40 | 41 | personalLibPath <- Sys.getenv("R_LIBS_USER") 42 | 43 | #If the personal library is not in the list of libraries 44 | 45 | if(is.na(match(personalLibPath, .libPaths()))) { 46 | 47 | #Then create the personal library 48 | 49 | dir.create(personalLibPath, recursive = TRUE) 50 | #And add the personal library to the list of libraries 51 | 52 | .libPaths(personalLibPath) 53 | 54 | } 55 | 56 | #Attempt to install the package into the personal library 57 | 58 | #If this fails, raise the error back to the report 59 | 60 | if(pkg=='INLA'){ 61 | install.packages('INLA', repos="https://inla.r-inla-download.org/R/stable", dep=TRUE) 62 | } 63 | if(pkg=='INLAutils'){ 64 | install_github('timcdlucas/INLAutils', dep = TRUE) 65 | } 66 | 67 | install.packages(pkg, lib=personalLibPath, repos="http://cran.rstudio.com") 68 | 69 | #Finally, attempt to load the package 70 | 71 | do.call("library", list(pkg)) 72 | 73 | }})}}) 74 | 75 | } 76 | 77 | CheckInstallPackages(libraries) -------------------------------------------------------------------------------- /src_import/utils.R: -------------------------------------------------------------------------------- 1 | loadMEUSE <- function(){ 2 | 3 | data(meuse) 4 | coordinates(meuse) <- ~x+y 5 | proj4string(meuse) <- CRS("+proj=sterea +lat_0=52.15616055555555 +lon_0=5.38763888888889 +k=0.9999079 +x_0=155000 +y_0=463000 +ellps=bessel +towgs84=565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812 +units=m +no_defs") 6 | 7 | data(meuse.grid) 8 | coordinates(meuse.grid) = ~x+y 9 | proj4string(meuse.grid) <- CRS("+proj=sterea +lat_0=52.15616055555555 +lon_0=5.38763888888889 +k=0.9999079 +x_0=155000 +y_0=463000 +ellps=bessel +towgs84=565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812 +units=m +no_defs") 10 | gridded(meuse.grid) = TRUE 11 | 12 | meuse_list <- list(meuse,meuse.grid) 13 | names(meuse_list) <- c('meuse','meuse.grid') 14 | 15 | return(meuse_list) 16 | } 17 | 18 
--------------------------------------------------------------------------------
/src_import/utils.R:
--------------------------------------------------------------------------------
loadMEUSE <- function(){

  data(meuse)
  coordinates(meuse) <- ~x+y
  proj4string(meuse) <- CRS("+proj=sterea +lat_0=52.15616055555555 +lon_0=5.38763888888889 +k=0.9999079 +x_0=155000 +y_0=463000 +ellps=bessel +towgs84=565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812 +units=m +no_defs")

  data(meuse.grid)
  coordinates(meuse.grid) <- ~x+y
  proj4string(meuse.grid) <- CRS("+proj=sterea +lat_0=52.15616055555555 +lon_0=5.38763888888889 +k=0.9999079 +x_0=155000 +y_0=463000 +ellps=bessel +towgs84=565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812 +units=m +no_defs")
  gridded(meuse.grid) <- TRUE

  meuse_list <- list(meuse, meuse.grid)
  names(meuse_list) <- c('meuse', 'meuse.grid')

  return(meuse_list)
}

## Pseudo-R2 = 1 - SS_res/SS_tot: the share of observed variance explained by the predictions
## (e.g. pseudoR2(c(1, 2, 3, 4), c(1.1, 1.9, 3.2, 3.8)) returns 0.98)
pseudoR2 <- function(y_obs, y_pred){
  res <- y_obs - y_pred
  tmp <- 1 - sum(res^2)/sum((y_obs - mean(y_obs))^2)
  return(tmp)
}

## Code from gstat
gstatkrg <- function(data, data.grid, formula, filename.out, filename.grid.out, var_model = "Sph"){

  # Compute the empirical variogram of the regression residuals and fit the chosen variogram model
  fit <- lm(formula, data)
  vg <- variogram(formula, data)
  fit.vg <- fit.variogram(vg, vgm(var_model))
  data$res <- fit$residuals
  write.table(data, filename.out, sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)

  # (Universal) kriging: predict on the grid and keep the prediction mean and standard deviation
  krg <- krige(formula, data, data.grid, model = fit.vg)
  data.grid$mean <- krg$var1.pred
  data.grid$sd <- sqrt(krg$var1.var)
  write.table(data.grid, filename.grid.out, sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)
}
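
## Example call (a sketch: log(zinc) ~ sqrt(dist) is the classic gstat meuse model;
## the /tmp output paths are assumptions for illustration):
# m <- loadMEUSE()
# gstatkrg(m$meuse, m$meuse.grid, log(zinc) ~ sqrt(dist),
#          '/tmp/meuse_krg.csv', '/tmp/meuse.grid_krg.csv')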

get_INLAspde_results <- function(data, data.grid, mesh, model, stack, response_var, filename.out, filename.grid.out, sp_res = c(300,300)){

  ## Predictions: fitted values at the training locations and at the prediction grid
  data_out <- data.frame(data, model$summary.fitted.values[inla.stack.index(stack, tag = "train")$data, 1:5])
  ## pseudoR2() returns a single value, recycled into a constant column
  data_out$pr2 <- pseudoR2(data_out[[response_var]], data_out$mean)
  data.grid_out <- data.frame(data.grid, model$summary.fitted.values[inla.stack.index(stack, tag = "pred")$data, 1:5])

  ## Spatial latent field, projected from the mesh onto a regular lattice
  proj <- inla.mesh.projector(mesh, dims = sp_res)
  spatial_mean <- inla.mesh.project(proj, model$summary.random[['spatial.field']][['mean']])
  spatial_sd <- inla.mesh.project(proj, model$summary.random[['spatial.field']][['sd']])
  spatial_0.025quant <- inla.mesh.project(proj, model$summary.random[['spatial.field']][['0.025quant']])
  spatial_0.5quant <- inla.mesh.project(proj, model$summary.random[['spatial.field']][['0.5quant']])
  spatial_0.975quant <- inla.mesh.project(proj, model$summary.random[['spatial.field']][['0.975quant']])
  spatial_mode <- inla.mesh.project(proj, model$summary.random[['spatial.field']][['mode']])

  sp_out <- as.data.frame(cbind(x = proj$lattice$loc[,1],
                                y = proj$lattice$loc[,2],
                                spatial_mean = melt(spatial_mean)$value,
                                spatial_sd = melt(spatial_sd)$value,
                                spatial_0.025quant = melt(spatial_0.025quant)$value,
                                spatial_0.5quant = melt(spatial_0.5quant)$value,
                                spatial_0.975quant = melt(spatial_0.975quant)$value,
                                spatial_mode = melt(spatial_mode)$value))

  write.table(data_out, filename.out, sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)
  write.table(data.grid_out, filename.grid.out, sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)
  write.table(sp_out, gsub('.csv', '_sp.csv', filename.out, fixed = TRUE), sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)

}

## Code from https://becarioprecario.bitbucket.io
INLAspde <- function(data, data.grid, family, response_var, predictors, data.crs, filename.out, filename.grid.out){

  #Define the mesh on a non-convex hull around the prediction grid
  bnd <- inla.nonconvex.hull(coordinates(data.grid), crs = data.crs)
  mesh <- inla.mesh.2d(loc = coordinates(data.grid), boundary = bnd, cutoff = 100, max.edge = c(250, 500), offset = c(100, 250))

  png(gsub('.csv', '_mesh.png', filename.out, fixed = TRUE), width = 20, height = 20, units = 'cm', res = 300)
  plot(mesh, asp = 1, main = "")
  points(coordinates(data), pch = 21, bg = 'red', col = 'red', cex = 1)
  dev.off()

  #Create the SPDE model with PC priors
  #sig0 = 0.1; rho0 = 0.1 #rho0 is typical range, sig0 typical sd
  ## prior.range = (ρ_0, P(ρ < ρ_0) = p_ρ), where ρ is the spatial range of the random field.
  ## prior.sigma = (σ_0, P(σ > σ_0) = p_σ), where σ is the marginal standard deviation of the field.
  spde <- inla.spde2.pcmatern(mesh = mesh, alpha = 2, constr = TRUE, prior.range = c(700, 0.1), prior.sigma = c(0.2, 0.1))
  s.index <- inla.spde.make.index(name = "spatial.field", n.spde = spde$n.spde)

  #Create the data structure for estimation
  A.train <- inla.spde.make.A(mesh = mesh, loc = coordinates(data))
  stack.train <- inla.stack(data = list(response_var = data[[response_var]]),
                            A = list(A.train, 1),
                            effects = list(c(s.index, list(Intercept = 1)),
                                           data.frame(data) %>%
                                             select(all_of(predictors)) %>%  #all_of() selects the columns named in the character vector
                                             as.list()),
                            tag = "train")

  #Create the data structure for prediction (response set to NA at the grid locations)
  A.pred <- inla.spde.make.A(mesh = mesh, loc = coordinates(data.grid))
  stack.pred <- inla.stack(data = list(response_var = NA),
                           A = list(A.pred, 1),
                           effects = list(c(s.index, list(Intercept = 1)),
                                          data.frame(data.grid) %>%
                                            select(all_of(predictors)) %>%
                                            as.list()),
                           tag = "pred")

  #Join the estimation and prediction stacks
  stack.join <- inla.stack(stack.train, stack.pred)

  #Fit the model
  ff <- as.formula(paste('response_var', paste(c('-1', 'Intercept', predictors, 'f(spatial.field, model = spde)'), collapse = " + "), sep = '~'))
  model <- inla(ff, data = inla.stack.data(stack.join, spde = spde),
                family = family,
                control.predictor = list(A = inla.stack.A(stack.join), compute = TRUE, link = 1),
                control.compute = list(cpo = TRUE, dic = TRUE), verbose = TRUE)

  #Summary of results
  print(summary(model))

  get_INLAspde_results(data, data.grid, mesh, model, stack.join, response_var, filename.out, filename.grid.out)

}
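
## Example call (a sketch: the log_zinc response, the 'dist' predictor choice and the
## /tmp output paths are assumptions for illustration, built on the objects returned
## by loadMEUSE()):
# m <- loadMEUSE()
# m$meuse$log_zinc <- log(m$meuse$zinc)
# INLAspde(m$meuse, m$meuse.grid, family = 'gaussian',
#          response_var = 'log_zinc', predictors = c('dist'),
#          data.crs = CRS(proj4string(m$meuse)),
#          filename.out = '/tmp/meuse_INLAspde.csv',
#          filename.grid.out = '/tmp/meuse.grid_INLAspde.csv')
--------------------------------------------------------------------------------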