├── .gitignore
├── Chapter 1-2
│   ├── Computing measures of spatial dependence.ipynb
│   ├── Continuous Spatial Models.ipynb
│   ├── Discrete Spatial Models.ipynb
│   ├── Visualizing spatial data with CARTOframes.ipynb
│   ├── datasets.py
│   └── utils.py
├── Chapter 3
│   ├── agglomerative.ipynb
│   ├── datasets.py
│   ├── dbscan.ipynb
│   ├── skater.ipynb
│   └── utils.py
├── Chapter 4
│   ├── Travelling Salesman Problem.ipynb
│   ├── data
│   │   └── target_mn_loc.p
│   ├── datasets.py
│   └── utilis.py
├── Dockerfile
├── README.md
├── requirements.txt
└── src_import
    ├── modules.R
    └── utils.R
/.gitignore:
--------------------------------------------------------------------------------
1 | MANIFEST
2 | build
3 | dist
4 | _build
5 | docs/man/*.gz
6 | docs/source/api/generated
7 | docs/source/config.rst
8 | docs/gh-pages
9 | notebook/i18n/*/LC_MESSAGES/*.mo
10 | notebook/i18n/*/LC_MESSAGES/nbjs.json
11 | notebook/static/components
12 | notebook/static/style/*.min.css*
13 | notebook/static/*/js/built/
14 | notebook/static/*/built/
15 | notebook/static/built/
16 | notebook/static/*/js/main.min.js*
17 | notebook/static/lab/*bundle.js
18 | node_modules
19 | *.py[co]
20 | __pycache__
21 | *.egg-info
22 | *~
23 | *.bak
24 | .ipynb_checkpoints
25 | .tox
26 | .DS_Store
27 | \#*#
28 | .#*
29 | .coverage
30 | .pytest_cache
31 | src
32 |
33 | *.swp
34 | *.map
35 | .idea/
36 | Read the Docs
37 | config.rst
38 | *.iml
39 | /.project
40 | /.pydevproject
41 |
42 | package-lock.json
43 | geckodriver.log
44 | *.iml
45 |
--------------------------------------------------------------------------------
/Chapter 1-2/datasets.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import geopandas as gpd
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import shapely
6 | from libpysal.weights import Queen
7 | import pointpats
8 | import pointpats.centrography
9 |
10 | from cartoframes.auth import set_default_credentials
11 | from cartoframes import read_carto
12 | from cartoframes import to_carto
13 |
14 | set_default_credentials('ebook-sds')
15 |
16 | ## The Meuse dataset from the R gstat package
17 | class GetMeuse():
18 | def __init__(self):
19 | self.data = read_carto('meuse')
20 | self.data['log_zinc'] = np.log(self.data['zinc'])
21 | self.data = self.data.to_crs({'init': 'epsg:28992'})
22 | self.data_lonlat = self.data.to_crs({'init': 'epsg:4326'})
23 |
24 | self.data_grid = read_carto('meuse_grid')
25 | self.data_grid = self.data_grid.to_crs({'init': 'epsg:28992'})
26 | self.data_grid_lonlat = self.data_grid.to_crs({'init': 'epsg:4326'})
27 |
28 | def loadpred_krg(self):
29 |
30 | self.data_krg = pd.read_csv('/tmp/meuse_krg.csv')
31 | self.data_krg = gpd.GeoDataFrame(self.data_krg, geometry=gpd.points_from_xy(self.data_krg.x, self.data_krg.y))
32 | self.data_krg.crs = {'init': 'epsg:28992'}
33 | self.data_krg_lonlat = self.data_krg.to_crs({'init': 'epsg:4326'})
34 |
35 | self.data_grid_krg = pd.read_csv('/tmp/meuse.grid_krg.csv')
36 | self.data_grid_krg = gpd.GeoDataFrame(self.data_grid_krg, geometry=gpd.points_from_xy(self.data_grid_krg.x, self.data_grid_krg.y))
37 | self.data_grid_krg.crs = {'init': 'epsg:28992'}
38 | self.data_grid_krg_lonlat = self.data_grid_krg.to_crs({'init': 'epsg:4326'})
39 |
40 | def loadpred_INLAspde(self):
41 |
42 | self.data_INLAspde = pd.read_csv('/tmp/meuse_INLAspde.csv')
43 | self.data_INLAspde = gpd.GeoDataFrame(self.data_INLAspde, geometry=gpd.points_from_xy(self.data_INLAspde.x, self.data_INLAspde.y))
44 | self.data_INLAspde.crs = {'init': 'epsg:28992'}
45 | self.data_INLAspde_lonlat = self.data_INLAspde.to_crs({'init': 'epsg:4326'})
46 |
47 | self.data_grid_INLAspde = pd.read_csv('/tmp/meuse.grid_INLAspde.csv')
48 | self.data_grid_INLAspde = gpd.GeoDataFrame(self.data_grid_INLAspde, geometry=gpd.points_from_xy(self.data_grid_INLAspde.x, self.data_grid_INLAspde.y))
49 | self.data_grid_INLAspde.crs = {'init': 'epsg:28992'}
50 | self.data_grid_INLAspde_lonlat = self.data_grid_INLAspde.to_crs({'init': 'epsg:4326'})
51 |
52 | ## The Boston dataset from the R spData package
53 | class GetBostonHousing():
54 | def __init__(self):
55 | self.data_carto = read_carto('boston_housing')
56 | ## Renaming the geometry column from 'the_geom' to 'geometry'
57 |         ## (pysal expects the geometry column to be called 'geometry')
58 | self.data = self.data_carto.copy()
59 | self.data['geometry'] = self.data.geometry
60 | self.data.drop(['the_geom'],axis = 1, inplace = True)
61 | self.data = gpd.GeoDataFrame(self.data, geometry = 'geometry')
62 | self.w = Queen.from_dataframe(self.data)
63 |
64 | def loadpred(self):
65 | self.data_preds = gpd.read_file('/tmp/boston_housing_predictions.shp')
66 | self.data_preds.crs = {'init': 'epsg:4326'}
67 |
68 | ## The Crime dataset from UK Police data
69 | class GetCrimeLondon():
70 | def __init__(self, var, var_value):
71 | self.filename = '/tmp/UK_Police_street_crimes_2019_04.csv'
72 | self.data = read_carto('uk_police_street_crimes_2019_04')
73 | self.data = self.data[self.data[var] == var_value]
74 | self.data_lonlat = self.data
75 | self.data_lonlat = read_carto('''
76 | SELECT c.*
77 | FROM uk_police_street_crimes_2019_04 as c
78 | JOIN london_borough_excluding_mhw as g
79 | ON ST_Intersects(c.the_geom, g.the_geom)
80 |
81 | ''')
82 | self.data = self.data.to_crs({'init': 'epsg:32630'})
83 |
84 | def pp(self):
85 | self.pointpattern = pointpats.PointPattern(
86 | pd.concat([self.data.geometry.x,self.data.geometry.y], axis=1)
87 | )
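
A minimal usage sketch for these loaders (assuming the 'ebook-sds' CARTO account is reachable; the GetCrimeLondon column/value pair below is hypothetical):

    from datasets import GetMeuse, GetBostonHousing, GetCrimeLondon

    meuse = GetMeuse()                                   # loads the 'meuse' and 'meuse_grid' tables
    print(meuse.data[['zinc', 'log_zinc']].head())

    boston = GetBostonHousing()                          # also builds a Queen contiguity weights object
    print(boston.w.n)

    crime = GetCrimeLondon('crime_type', 'Burglary')     # hypothetical filter column and value
    crime.pp()                                           # wraps the points in a pointpats.PointPattern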
--------------------------------------------------------------------------------
/Chapter 1-2/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import pandas as pd
4 | import geopandas as gpd
5 | import shapely
6 |
7 | def Variogram_plot(v, fig_title=None, axes=None, grid=True, show=False, hist=True):
8 | """Variogram Plot
9 | Plot the experimental variogram, the fitted theoretical function and
10 |     a histogram for the lag classes. The axes attribute can be used to
11 | pass a list of AxesSubplots or a single instance to the plot
12 | function. Then these Subplots will be used. If only a single instance
13 | is passed, the hist attribute will be ignored as only the variogram
14 | will be plotted anyway.
15 | Parameters
16 | ----------
17 | axes : list, tuple, array, AxesSubplot or None
18 | If None, the plot function will create a new matplotlib figure.
19 | Otherwise a single instance or a list of AxesSubplots can be
20 | passed to be used. If a single instance is passed, the hist
21 | attribute will be ignored.
22 | grid : bool
23 | Defaults to True. If True a custom grid will be drawn through
24 | the lag class centers
25 | show : bool
26 |         Defaults to False. Kept for signature compatibility; this helper
27 |         simply returns the Figure without calling its show method, which
28 |         is usually what you want in a notebook, where a returned Figure
29 |         object is rendered automatically.
30 | hist : bool
31 | Defaults to True. If False, the creation of a histogram for the
32 | lag classes will be suppressed.
33 | Returns
34 | -------
35 | matplotlib.Figure
36 | """
37 | # get the parameters
38 | _bins = v.bins
39 | _exp = v.experimental
40 | x = np.linspace(0, np.nanmax(_bins), 100) # make the 100 a param?
41 |
42 | # do the plotting
43 | if axes is None:
44 | if hist:
45 | fig = plt.figure(figsize=(8, 5))
46 | ax1 = plt.subplot2grid((5, 1), (1, 0), rowspan=4)
47 | ax2 = plt.subplot2grid((5, 1), (0, 0), sharex=ax1)
48 | fig.subplots_adjust(hspace=0)
49 | else:
50 | fig, ax1 = plt.subplots(1, 1, figsize=(8, 4))
51 | ax2 = None
52 | elif isinstance(axes, (list, tuple, np.ndarray)):
53 | ax1, ax2 = axes
54 | fig = ax1.get_figure()
55 | else:
56 | ax1 = axes
57 | ax2 = None
58 | fig = ax1.get_figure()
59 |
60 | # apply the model
61 | y = v.transform(x)
62 |
63 | # handle the relative experimental variogram
64 | if v.normalized:
65 | _bins /= np.nanmax(_bins)
66 | y /= np.max(_exp)
67 | _exp /= np.nanmax(_exp)
68 | x /= np.nanmax(x)
69 |
70 | # ------------------------
71 | # plot Variograms
72 | ax1.plot(_bins, _exp, marker=".", color='orange', markersize=15, linestyle='None')
73 | ax1.plot(x, y, 'blue', linewidth=2)
74 | ax1.set_facecolor('white')
75 |
76 | # ax limits
77 | if v.normalized:
78 | ax1.set_xlim([0, 1.05])
79 | ax1.set_ylim([0, 1.05])
80 | if grid:
81 |         ax1.grid(False)
82 | ax1.vlines(_bins, *ax1.axes.get_ybound(), colors=(.85, .85, .85),
83 | linestyles='dashed',linewidth=0.5)
84 | # annotation
85 | ax1.axes.set_ylabel('semivariance (%s)' % v._estimator.__name__)
86 | ax1.axes.set_xlabel('Lag (-)')
87 |
88 | # ------------------------
89 | # plot histogram
90 | if ax2 is not None and hist:
91 | # calc the histogram
92 | _count = np.fromiter(
93 | (g.size for g in v.lag_classes()), dtype=int
94 | )
95 |
96 | # set the sum of hist bar widths to 70% of the x-axis space
97 | w = (np.max(_bins) * 0.7) / len(_count)
98 |
99 | # plot
100 | ax2.bar(_bins, _count, width=w, align='center', color='blue')
101 |
102 | # adjust
103 | plt.setp(ax2.axes.get_xticklabels(), visible=False)
104 | ax2.axes.set_yticks(ax2.axes.get_yticks()[1:])
105 |
106 | # need a grid?
107 | if grid:
108 |             ax2.grid(False)
109 | ax2.vlines(_bins, *ax2.axes.get_ybound(),
110 | colors=(.85, .85, .85), linestyles='dashed',linewidth=0.5)
111 |
112 |         # annotate
113 | ax2.axes.set_ylabel('N')
114 | ax2.set_facecolor('white')
115 |
116 | plt.title(fig_title)
117 | return fig
118 |
119 | def geom2gdf(geom, crs, lonlat = True):
120 | geom = [['geom', geom]]
121 | geom = pd.DataFrame(geom, columns = ['geom', 'geometry'])
122 | geom = gpd.GeoDataFrame(geom, geometry=geom.geometry)
123 | geom.crs = {'init': crs}
124 | if(lonlat):
125 | geom = geom.to_crs({'init': 'epsg:4326'})
126 |
127 | return geom
128 |
129 | def ell2gdf(M, sMx, sMy, theta, crs):
130 | circ = shapely.geometry.Point(M).buffer(1)
131 | ell = shapely.affinity.scale(circ, int(sMx), int(sMy))
132 | ellr = shapely.affinity.rotate(ell,-np.degrees(theta))
133 | poly = geom2gdf(ellr, crs)
134 |
135 | return poly
136 |
137 | def ncdump(nc_fid, verb=True):
138 | '''
139 | ncdump outputs dimensions, variables and their attribute information.
140 | The information is similar to that of NCAR's ncdump utility.
141 | ncdump requires a valid instance of Dataset.
142 | Parameters
143 | ----------
144 | nc_fid : netCDF4.Dataset
145 |         A netCDF4 dataset object
146 | verb : Boolean
147 | whether or not nc_attrs, nc_dims, and nc_vars are printed
148 | Returns
149 | -------
150 | nc_attrs : list
151 | A Python list of the NetCDF file global attributes
152 | nc_dims : list
153 | A Python list of the NetCDF file dimensions
154 | nc_vars : list
155 | A Python list of the NetCDF file variables
156 | '''
157 | def print_ncattr(key):
158 | """
159 | Prints the NetCDF file attributes for a given key
160 | Parameters
161 | ----------
162 | key : unicode
163 | a valid netCDF4.Dataset.variables key
164 | """
165 | try:
166 | print("\t\ttype:", repr(nc_fid.variables[key].dtype))
167 | for ncattr in nc_fid.variables[key].ncattrs():
168 | print('\t\t%s:' % ncattr,\
169 | repr(nc_fid.variables[key].getncattr(ncattr)))
170 | except KeyError:
171 | print("\t\tWARNING: %s does not contain variable attributes" % key)
172 | # NetCDF global attributes
173 | nc_attrs = nc_fid.ncattrs()
174 | if verb:
175 | print("NetCDF Global Attributes:")
176 | for nc_attr in nc_attrs:
177 | print('\t%s:' % nc_attr, repr(nc_fid.getncattr(nc_attr)))
178 | nc_dims = [dim for dim in nc_fid.dimensions] # list of nc dimensions
179 | # Dimension shape information.
180 | if verb:
181 | print("NetCDF dimension information:")
182 | for dim in nc_dims:
183 | print("\tName:", dim)
184 | print("\t\tsize:", len(nc_fid.dimensions[dim]))
185 | print_ncattr(dim)
186 | # Variable information.
187 | nc_vars = [var for var in nc_fid.variables] # list of nc variables
188 | if verb:
189 | print("NetCDF variable information:")
190 | for var in nc_vars:
191 | if var not in nc_dims:
192 | print('\tName:', var)
193 | print("\t\tdimensions:", nc_fid.variables[var].dimensions)
194 | print("\t\tsize:", nc_fid.variables[var].size)
195 | print_ncattr(var)
196 | return nc_attrs, nc_dims, nc_vars
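
A short usage sketch for Variogram_plot (a sketch only, assuming scikit-gstat from requirements.txt is installed; the coordinates and values below are synthetic):

    import numpy as np
    from skgstat import Variogram
    from utils import Variogram_plot

    rng = np.random.default_rng(0)
    coords = rng.uniform(0, 1000, size=(100, 2))   # synthetic sample locations
    values = rng.normal(size=100)                  # synthetic measurements
    v = Variogram(coords, values, n_lags=10)

    fig = Variogram_plot(v, fig_title='Empirical variogram with fitted model')
    fig.savefig('variogram.png')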
--------------------------------------------------------------------------------
/Chapter 3/datasets.py:
--------------------------------------------------------------------------------
1 | import geopandas as gpd
2 | from cartoframes.auth import set_default_credentials
3 | from cartoframes import read_carto
4 | from cartoframes import to_carto
5 |
6 | set_default_credentials("ebook-sds")
7 |
8 |
9 | def get_table(tablename):
10 | """Retrieve tablename as a GeoDataFrame ordered by database id
11 |
12 | Returns:
13 | geopandas.GeoDataFrame: GeoDataFrame representation of table
14 | """
15 | base_query = ("SELECT * FROM {tablename} ORDER BY cartodb_id ASC").format(
16 | tablename=tablename
17 | )
18 | data_carto = read_carto(base_query)
19 | ## Renaming the geometry column from 'the_geom' to 'geometry'
20 |     ## (pysal expects the geometry column to be called 'geometry')
21 | data = data_carto.copy()
22 | data['geometry'] = data.geometry
23 | data.drop(['the_geom'],axis = 1, inplace = True)
24 | data = gpd.GeoDataFrame(data, geometry = 'geometry')
25 | data.crs = {"init": "epsg:4326"}
26 |
27 | return data
28 |
29 |
30 | def get_nyc_census_tracts():
31 | """Retrieve dataset on NYC Census Tracts
32 |
33 | Returns:
34 | geopandas.GeoDataFrame: GeoDataFrame representation of table
35 | """
36 | return get_table("census_tracts_cleaned")
37 |
38 |
39 | def get_safegraph_visits():
40 | """Retrieve Safegraph visit data for Panama City Beach in July 2019
41 | as a GeoDataFrame ordered by database id
42 |
43 | Returns:
44 | geopandas.GeoDataFrame: GeoDataFrame representation of table
45 | """
46 | return get_table("safegraph_pcb_visits")
47 |
--------------------------------------------------------------------------------
/Chapter 3/dbscan.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# DBSCAN with Visit Data\n",
8 | "\n",
9 | "For this exercise, we will be working with a sample of [Safegraph's Patterns dataset](https://blog.safegraph.com/introducing-places-patterns-17ac5b96fb33).\n",
10 | "\n",
11 | "The data is a set of home locations from which people travel to visit Panama City Beach, Florida during the month of July 2019. This example is a basic reproduction of some of the findings in the [CARTO <> Safegraph partnership blog post](https://carto.com/blog/visit-pattern-footfall-data-safegraph/). The data comes from the `visitor_home_cbgs` home attribute for all Points of Interest (POIs) in Panama City Beach, Florida. See the [Patterns documentation](https://docs.safegraph.com/docs/places-schema#section-patterns) for more information.\n",
12 | "\n",
13 | "Since we know the locations that people are coming from, it might be natural to ask if there are general regions that we can identify as drivers of the visits. For example, are there areas with a higher density of source visits that could be used to understand visit demographics?\n",
14 | "\n",
15 | "Let's get started by downloading the data and taking a look at it."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import geopandas as gpd\n",
25 | "import numpy as np\n",
26 | "\n",
27 | "from cartoframes.viz import Map, Layer\n",
28 | "\n",
29 | "import datasets\n",
30 | "import warnings\n",
31 | "\n",
32 | "warnings.filterwarnings(\"ignore\")"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "### Retrieve the data"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/html": [
 50 | "<dataframe HTML preview removed: extraction residue; the text/plain output below carries the same table>"
119 | ],
120 | "text/plain": [
121 | " cartodb_id longitude latitude num_visits geometry\n",
122 | "0 1 -84.690182 33.990924 5 POINT (-84.69018 33.99092)\n",
123 | "1 2 -85.877212 30.216679 13 POINT (-85.87721 30.21668)\n",
124 | "2 3 -85.173263 31.904274 6 POINT (-85.17326 31.90427)\n",
125 | "3 4 -86.006852 34.632641 5 POINT (-86.00685 34.63264)\n",
126 | "4 5 -85.038783 32.523741 11 POINT (-85.03878 32.52374)"
127 | ]
128 | },
129 | "execution_count": 2,
130 | "metadata": {},
131 | "output_type": "execute_result"
132 | }
133 | ],
134 | "source": [
135 | "sg_pcb = datasets.get_safegraph_visits()\n",
136 | "sg_pcb.head()"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "This is a point dataset associated with the number of visits."
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "### Visualize points on map"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 3,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/html": [
161 | ""
1052 | ],
1053 | "text/plain": [
1054 | ""
1055 | ]
1056 | },
1057 | "execution_count": 3,
1058 | "metadata": {},
1059 | "output_type": "execute_result"
1060 | }
1061 | ],
1062 | "source": [
1063 | "Layer(sg_pcb)"
1064 | ]
1065 | },
1066 | {
1067 | "cell_type": "markdown",
1068 | "metadata": {},
1069 | "source": [
1070 | "### Calculate Clusters\n",
1071 | "\n",
 1072 |     "To calculate clusters, we will use DBSCAN because it finds clusters based on density and works well with spatial distance measures."
1073 | ]
1074 | },
1075 | {
1076 | "cell_type": "code",
1077 | "execution_count": 4,
1078 | "metadata": {},
1079 | "outputs": [
1080 | {
1081 | "name": "stdout",
1082 | "output_type": "stream",
1083 | "text": [
1084 | "Number of clusters: 9\n"
1085 | ]
1086 | }
1087 | ],
1088 | "source": [
1089 | "from sklearn.cluster import dbscan\n",
1090 | "\n",
1091 | "# use lat/lng in radians as coordinates\n",
1092 | "coords = np.radians(sg_pcb[[\"latitude\", \"longitude\"]].values)\n",
1093 | "\n",
1094 | "# choose appropriate epsilon value\n",
1095 | "# here we use ~35 kilometers\n",
1096 | "kms_per_radian = 6371\n",
1097 | "epsilon = 35 / kms_per_radian\n",
1098 | "\n",
1099 | "# calculate clusters\n",
1100 | "# use haversine metric for calculating approximate distances on earth's surface (crow fly)\n",
1101 | "_, cluster_labels = dbscan(\n",
1102 | " coords, eps=epsilon, min_samples=4, algorithm=\"ball_tree\", metric=\"haversine\",\n",
1103 | ")\n",
1104 | "\n",
1105 | "print(\"Number of clusters: {}\".format(len(set(cluster_labels))))"
1106 | ]
1107 | },
1108 | {
1109 | "cell_type": "markdown",
1110 | "metadata": {},
1111 | "source": [
1112 | "### Add cluster labels to data\n",
1113 | "\n",
1114 | "Now that we have uncovered some natural clusters, let's give them some appropriate labels.\n",
1115 | "\n",
 1116 |     "Looking at the map below, we can see a few clusters that are easy to identify (e.g., local Panama City Beach and the large area in northern Alabama and Georgia), while other clusters are smaller and less significant. A label of `-1` marks 'noise', i.e., points that do not fall into any cluster."
1117 | ]
1118 | },
1119 | {
1120 | "cell_type": "code",
1121 | "execution_count": 5,
1122 | "metadata": {},
1123 | "outputs": [
1124 | {
1125 | "data": {
1126 | "text/html": [
1127 | ""
2042 | ],
2043 | "text/plain": [
2044 | ""
2045 | ]
2046 | },
2047 | "execution_count": 5,
2048 | "metadata": {},
2049 | "output_type": "execute_result"
2050 | }
2051 | ],
2052 | "source": [
2053 | "from cartoframes.viz import color_category_style\n",
2054 | "\n",
2055 | "# convert labels to text for creating a category map\n",
2056 | "sg_pcb[\"dbscan_labels\"] = [str(s) for s in cluster_labels]\n",
2057 | "\n",
2058 | "# show distribution of labels\n",
2059 | "Layer(sg_pcb, color_category_style('dbscan_labels'))"
2060 | ]
2061 | },
2062 | {
2063 | "cell_type": "markdown",
2064 | "metadata": {},
2065 | "source": [
2066 | "### Apply readable labels to clusters"
2067 | ]
2068 | },
2069 | {
2070 | "cell_type": "code",
2071 | "execution_count": 6,
2072 | "metadata": {},
2073 | "outputs": [],
2074 | "source": [
2075 | "sg_pcb[\"dbscan_labels\"] = cluster_labels\n",
2076 | "\n",
2077 | "# identify points as within a cluster or not\n",
2078 | "def in_cluster(cluster_num):\n",
2079 | " if cluster_num == -1:\n",
2080 | " return \"Out of cluster\"\n",
2081 | " return \"In cluster\"\n",
2082 | "\n",
2083 | "\n",
2084 | "sg_pcb[\"in_cluster\"] = sg_pcb[\"dbscan_labels\"].apply(in_cluster)"
2085 | ]
2086 | },
2087 | {
2088 | "cell_type": "markdown",
2089 | "metadata": {},
2090 | "source": [
2091 | "### Calculate Convex Hulls to show approximate cluster region\n",
2092 | "\n",
2093 | "To get approximate polygons to represent the regions, we can group the points by label and draw a convex hull. We also added a small buffer to improve the cartography."
2094 | ]
2095 | },
2096 | {
2097 | "cell_type": "code",
2098 | "execution_count": 7,
2099 | "metadata": {},
2100 | "outputs": [],
2101 | "source": [
2102 | "# group clusters (excluding noise)\n",
2103 | "# union points within cluster\n",
2104 | "# create a convex hull and small buffer\n",
2105 | "cluster_hulls = (\n",
2106 | " sg_pcb[sg_pcb[\"dbscan_labels\"] != -1]\n",
2107 | " .groupby(\"dbscan_labels\")\n",
2108 | " .geometry.apply(lambda x: x.unary_union.convex_hull.buffer(0.05))\n",
2109 | " .reset_index()\n",
2110 | ")\n",
2111 | "\n",
2112 | "cluster_hulls = gpd.GeoDataFrame(cluster_hulls)\n",
2113 | "\n",
2114 | "# Give cluster labels more readable titles\n",
2115 | "cluster_title_mapping = {\n",
2116 | " -1: \"Outlier\",\n",
2117 | " 0: \"Northern Alabama and Georgia\",\n",
2118 | " 1: \"Panama City Beach (Locals)\",\n",
2119 | "}\n",
2120 | "cluster_title_mapping.update(\n",
2121 | " {k: \"Other smaller region\" for k in range(2, max(cluster_labels) + 1)}\n",
2122 | ")\n",
2123 | "\n",
2124 | "cluster_hulls[\"dbscan_labels_readable\"] = cluster_hulls[\"dbscan_labels\"].apply(\n",
2125 | " lambda x: cluster_title_mapping.get(x)\n",
2126 | ")"
2127 | ]
2128 | },
2129 | {
2130 | "cell_type": "code",
2131 | "execution_count": 8,
2132 | "metadata": {},
2133 | "outputs": [
2134 | {
2135 | "data": {
2136 | "text/html": [
 2137 | "<dataframe HTML preview removed: extraction residue; the text/plain output below carries the same table>"
2212 | ],
2213 | "text/plain": [
2214 | " dbscan_labels geometry \\\n",
2215 | "0 0 POLYGON ((-84.94719 32.21661, -84.95174 32.215... \n",
2216 | "1 1 POLYGON ((-85.69464 30.06520, -85.69932 30.063... \n",
2217 | "2 2 POLYGON ((-85.12766 31.76488, -85.12806 31.759... \n",
2218 | "3 3 POLYGON ((-83.43705 31.46919, -83.43920 31.464... \n",
2219 | "4 4 POLYGON ((-87.06329 36.04642, -87.06598 36.042... \n",
2220 | "5 5 POLYGON ((-85.91513 31.18973, -85.91990 31.189... \n",
2221 | "6 6 POLYGON ((-88.37893 34.01090, -88.38195 34.006... \n",
2222 | "7 7 POLYGON ((-86.39062 35.74135, -86.39316 35.737... \n",
2223 | "\n",
2224 | " dbscan_labels_readable \n",
2225 | "0 Northern Alabama and Georgia \n",
2226 | "1 Panama City Beach (Locals) \n",
2227 | "2 Other smaller region \n",
2228 | "3 Other smaller region \n",
2229 | "4 Other smaller region \n",
2230 | "5 Other smaller region \n",
2231 | "6 Other smaller region \n",
2232 | "7 Other smaller region "
2233 | ]
2234 | },
2235 | "execution_count": 8,
2236 | "metadata": {},
2237 | "output_type": "execute_result"
2238 | }
2239 | ],
2240 | "source": [
2241 | "cluster_hulls"
2242 | ]
2243 | },
2244 | {
2245 | "cell_type": "markdown",
2246 | "metadata": {},
2247 | "source": [
2248 | "### Visualize outputs"
2249 | ]
2250 | },
2251 | {
2252 | "cell_type": "code",
2253 | "execution_count": 9,
2254 | "metadata": {},
2255 | "outputs": [
2256 | {
2257 | "data": {
2258 | "text/html": [
2259 | ""
3216 | ],
3217 | "text/plain": [
3218 | ""
3219 | ]
3220 | },
3221 | "execution_count": 9,
3222 | "metadata": {},
3223 | "output_type": "execute_result"
3224 | }
3225 | ],
3226 | "source": [
3227 | "from cartoframes.viz import color_category_legend, category_widget\n",
3228 | "\n",
3229 | "\n",
3230 | "Map(\n",
3231 | " [Layer(cluster_hulls,\n",
3232 | " style = color_category_style(\"dbscan_labels_readable\",\n",
3233 | " opacity=0.7,\n",
3234 | " palette=[\"#66C5CC\", \"#DCB0F2\", \"#F89C74\"],\n",
3235 | " stroke_color=\"transparent\"),\n",
3236 | " legends=color_category_legend(title=\"Visit Regions\"),\n",
3237 | " widgets=[category_widget('dbscan_labels_readable',\n",
 3238 |                                   title='Cluster labels',\n",
3239 | " description='Select a category to filter')]\n",
3240 | " ),\n",
3241 | " Layer(sg_pcb,\n",
3242 | " style = color_category_style(\"in_cluster\",\n",
3243 | " palette=[\"#666\", \"deeppink\"],\n",
3244 | " opacity=0.5),\n",
3245 | " legends=color_category_legend(title=\"In Cluster\")\n",
3246 | " )]\n",
3247 | ")"
3248 | ]
3249 | }
3250 | ],
3251 | "metadata": {
3252 | "kernelspec": {
3253 | "display_name": "Python 3",
3254 | "language": "python",
3255 | "name": "python3"
3256 | },
3257 | "language_info": {
3258 | "codemirror_mode": {
3259 | "name": "ipython",
3260 | "version": 3
3261 | },
3262 | "file_extension": ".py",
3263 | "mimetype": "text/x-python",
3264 | "name": "python",
3265 | "nbconvert_exporter": "python",
3266 | "pygments_lexer": "ipython3",
3267 | "version": "3.7.3"
3268 | }
3269 | },
3270 | "nbformat": 4,
3271 | "nbformat_minor": 4
3272 | }
3273 |
--------------------------------------------------------------------------------
/Chapter 3/utils.py:
--------------------------------------------------------------------------------
1 | def get_nyc_bridge_connections():
2 | return {
3 | # Verrazzano-Narrows Bridge (Staten Island <-> Brooklyn)
4 | 4551: [2571],
5 | 2571: [4551],
6 | # Williamsburg Bridge (Manhattan <-> Brooklyn)
7 | 2923: [3498],
8 | 3498: [2923],
9 | # Queensborough Bridge (Manhattan <-> Queens)
10 | 3595: [3900],
11 | 3900: [3736, 3595],
12 | # Roosevelt Island <-> Queens
13 | 3736: [3900],
14 | # Astoria <-> East Harlem
15 | 3943: [3688],
16 | 3688: [2028, 3943],
17 | # East Harlem <-> Bronx
18 | 2028: [3688],
19 | # Northern Manhattan <-> Marble Hill
20 | 3769: [2237],
21 | 2237: [3769],
22 | # Rockaways
23 | 4411: [3009],
24 | 3009: [4411],
25 | }
26 |
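A hedged sketch of how this mapping could be used to stitch the bridge links into a contiguity graph for the spatially constrained clustering notebooks (it assumes the keys above match the ids of the weights object built from the census-tract GeoDataFrame):

    from libpysal.weights import Queen, W

    import datasets
    import utils

    tracts = datasets.get_nyc_census_tracts()
    w = Queen.from_dataframe(tracts)

    # copy the contiguity neighbours and add the bridge connections
    # (the dict above already lists each bridge in both directions)
    neighbors = {k: list(v) for k, v in w.neighbors.items()}
    for i, linked in utils.get_nyc_bridge_connections().items():
        neighbors[i] = sorted(set(neighbors[i]) | set(linked))

    w_connected = W(neighbors)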
--------------------------------------------------------------------------------
/Chapter 4/data/target_mn_loc.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/data-science-book/c0b6813117709c43dca6a9330c8380e52fba0d5e/Chapter 4/data/target_mn_loc.p
--------------------------------------------------------------------------------
/Chapter 4/datasets.py:
--------------------------------------------------------------------------------
1 | import geopandas as gpd
2 | from cartoframes.auth import set_default_credentials
3 | from cartoframes import read_carto
4 | from cartoframes import to_carto
5 |
6 | set_default_credentials('ebook-sds')
7 |
8 | def get_retail_store_minnesota():
9 | """Retrieve Retail Store Locations in Minnesota
10 |
11 | Returns:
12 | geopandas.GeoDataFrame: GeoDataFrame representation of table
13 | """
14 | table_name = 'retail_store_minnesota'
15 | data = read_carto(table_name)
16 | data['store_id'] = data['store_id'].apply(lambda x: str(int(x)))
17 | data.crs = {'init': 'epsg:4326'}
18 | return data
--------------------------------------------------------------------------------
/Chapter 4/utilis.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
3 |
4 | import operator
5 | import random
6 | from copy import deepcopy
7 | from itertools import combinations
8 | import pickle as pkl
9 |
10 | import numpy as np
11 | import math
12 | import pandas as pd
13 | import geopandas as gpd
14 | from shapely.geometry import *
15 | from geopy.distance import great_circle
16 | from simanneal import Annealer
 17 | import networkx as nx  # required by the Euler_Tour helper further below
18 | import cartoframes
19 | from cartoframes.viz import *
20 | from cartoframes.data import *
21 |
22 | import seaborn as sns
23 | from matplotlib import pyplot as plt
24 | plt.style.use('ggplot')
25 |
26 |
27 | #######################################################
28 | # Solve TSP using Ant Colony Optimization in Python 3 #
29 | # Code Source: #
30 | # https://github.com/ppoffice/ant-colony-tsp #
31 | #######################################################
32 | class Graph(object):
33 | def __init__(self, cost_matrix: list, rank: int):
34 | """
35 | :param cost_matrix:
36 | :param rank: rank of the cost matrix
37 | """
38 | self.matrix = cost_matrix
39 | self.rank = rank
40 | self.pheromone = [[1 / (rank * rank) for j in range(rank)] for i in range(rank)]
41 | logging.info(f'[Done] Load the Graph')
42 |
43 |
44 | class ACO(object):
45 | def __init__(self, ant_count: int, generations: int, alpha: float, beta: float, rho: float, q: int,
46 | strategy: int):
47 | """
48 | :param ant_count:
49 | :param generations:
50 | :param alpha: relative importance of pheromone
51 | :param beta: relative importance of heuristic information
52 | :param rho: pheromone residual coefficient
53 | :param q: pheromone intensity
54 | :param strategy: pheromone update strategy. 0 - ant-cycle, 1 - ant-quality, 2 - ant-density
55 | """
56 | self.Q = q
57 | self.rho = rho
58 | self.beta = beta
59 | self.alpha = alpha
60 | self.ant_count = ant_count
61 | self.generations = generations
62 | self.update_strategy = strategy
63 |
64 | def _update_pheromone(self, graph: Graph, ants: list):
65 | for i, row in enumerate(graph.pheromone):
66 | for j, col in enumerate(row):
67 | graph.pheromone[i][j] *= self.rho
68 | for ant in ants:
69 | graph.pheromone[i][j] += ant.pheromone_delta[i][j]
70 |
71 | def solve(self, graph: Graph):
72 | """
73 | :param graph:
74 | """
75 | best_cost = float('inf')
76 | best_solution = []
77 | all_costs = []
78 | for gen in range(self.generations):
79 | ants = [_Ant(self, graph) for i in range(self.ant_count)]
80 | for ind, ant in enumerate(ants):
81 | for i in range(graph.rank - 1):
82 | ant._select_next()
83 | ant.total_cost += graph.matrix[ant.tabu[-1]][ant.tabu[0]]
84 | if ant.total_cost < best_cost:
85 | best_cost = ant.total_cost
86 | best_solution = [] + ant.tabu
87 | # update pheromone
88 | ant._update_pheromone_delta()
89 | self._update_pheromone(graph, ants)
90 | logging.info(f'[Generation #{gen}] [best cost: {best_cost}]')
91 | return best_solution, best_cost
92 |
93 | class _Ant(object):
94 | def __init__(self, aco: ACO, graph: Graph):
95 | self.colony = aco
96 | self.graph = graph
97 | self.total_cost = 0.0
98 | self.tabu = [] # tabu list
99 | self.pheromone_delta = [] # the local increase of pheromone
100 | self.allowed = [i for i in range(graph.rank)] # nodes which are allowed for the next selection
101 | self.eta = [[0 if i == j else 1 / graph.matrix[i][j] for j in range(graph.rank)] for i in
102 | range(graph.rank)] # heuristic information
103 | start = random.randint(0, graph.rank - 1) # start from any node
104 | self.tabu.append(start)
105 | self.current = start
106 | self.allowed.remove(start)
107 |
108 | def _select_next(self):
109 | denominator = 0
110 | for i in self.allowed:
111 | denominator += self.graph.pheromone[self.current][i] ** self.colony.alpha * self.eta[self.current][i] ** self.colony.beta
112 | probabilities = [0 for i in range(self.graph.rank)] # probabilities for moving to a node in the next step
113 | for i in range(self.graph.rank):
114 | try:
115 | self.allowed.index(i) # test if allowed list contains i
116 | probabilities[i] = self.graph.pheromone[self.current][i] ** self.colony.alpha * \
117 | self.eta[self.current][i] ** self.colony.beta / denominator
118 | except ValueError:
119 | pass # do nothing
120 | # select next node by probability roulette
121 | selected = 0
122 | rand = random.random()
123 | for i, probability in enumerate(probabilities):
124 | rand -= probability
125 | if rand <= 0:
126 | selected = i
127 | break
128 | self.allowed.remove(selected)
129 | self.tabu.append(selected)
130 | self.total_cost += self.graph.matrix[self.current][selected]
131 | self.current = selected
132 |
133 | def _update_pheromone_delta(self):
134 | self.pheromone_delta = [[0 for j in range(self.graph.rank)] for i in range(self.graph.rank)]
135 | for _ in range(1, len(self.tabu)):
136 | i = self.tabu[_ - 1]
137 | j = self.tabu[_]
138 | if self.colony.update_strategy == 1: # ant-quality system
139 | self.pheromone_delta[i][j] = self.colony.Q
140 | elif self.colony.update_strategy == 2: # ant-density system
141 | self.pheromone_delta[i][j] = self.colony.Q / self.graph.matrix[i][j]
142 | else: # ant-cycle system
143 | self.pheromone_delta[i][j] = self.colony.Q / self.total_cost
144 |
145 | def distance_aco(cities: dict):
146 | distance_matrix = []
147 | for ka, va in cities.items():
148 | record_distance = []
149 | for kb, vb in cities.items():
150 | if kb == ka:
151 | record_distance.append(0.0)
152 | else:
153 | record_distance.append(great_circle(va, vb).m)
154 | distance_matrix.append(record_distance)
155 | logging.info(f'[Done] Create A Distance Matrix For ACO')
156 | return distance_matrix
157 |
158 |
159 |
160 | def location(data: pd.DataFrame,
161 | id_col: str,
162 | geometry_col: str) -> dict :
163 | """
164 | Extract Location from dataframe. Output a dict as {id: (lng, lat), ...}
165 | """
166 | loc = {}
167 | for row in data.iterrows():
168 | loc_id = row[1][id_col]
169 | x, y = row[1][geometry_col].x, row[1][geometry_col].y
170 | loc[loc_id] = loc.get(loc_id, (x,y))
 171 |     logging.info('[Done] Transform DataFrame To Location Dict')
172 | return loc
173 |
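A minimal sketch wiring the ACO pieces above together (assuming the Chapter 4 datasets.get_retail_store_minnesota() loader works and the CARTO account is reachable; the ACO hyperparameters are illustrative):

    import datasets
    from utilis import ACO, Graph, distance_aco, location

    stores = datasets.get_retail_store_minnesota()
    loc = location(stores, id_col='store_id', geometry_col=stores.geometry.name)

    cost_matrix = distance_aco(loc)
    graph = Graph(cost_matrix, rank=len(loc))
    aco = ACO(ant_count=10, generations=100, alpha=1.0, beta=10.0,
              rho=0.5, q=10, strategy=2)
    best_path, best_cost = aco.solve(graph)

    # solve() returns positional indices into the cost matrix; map them back to store ids
    ids = list(loc.keys())
    route = [ids[i] for i in best_path]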
174 |
175 | #######################################################
176 | # Christofides algorithm #
177 | # Code Source: #
178 | # https://github.com/Retsediv/ChristofidesAlgorithm #
179 | #######################################################
180 |
181 | def christofides(data):
182 | # build a graph
183 | G = build_graph(data)
184 | # print("Graph: ", G)
185 |
186 | # build a minimum spanning tree
187 | MSTree = minimum_spanning_tree(G)
188 | MSTree_init = deepcopy(MSTree)
189 | # print("MSTree: ", MSTree)
190 |
191 | # find odd vertexes
192 | odd_vertexes = find_odd_vertexes(MSTree)
193 | odd_vertexes_init = deepcopy(odd_vertexes)
194 | # print("Odd vertexes in MSTree: ", odd_vertexes)
195 |
196 | # add minimum weight matching edges to MST
197 | new_added_matching = minimum_weight_matching(MSTree, G, odd_vertexes)
198 | united_MSTree_perfect_matching = deepcopy(MSTree)
199 | # print("Minimum weight matching: ", MSTree)
200 |
201 | # find an eulerian tour
202 | eulerian_tour = find_eulerian_tour(MSTree, G)
203 |
204 | # print("Eulerian tour: ", eulerian_tour)
205 |
206 | current = eulerian_tour[0]
207 | path = [current]
208 | visited = [False] * len(eulerian_tour)
209 |
210 | length = 0
211 |
212 | for v in eulerian_tour[1:]:
213 | if not visited[v]:
214 | path.append(v)
215 | visited[v] = True
216 |
217 | length += G[current][v]
218 | current = v
219 |
220 | # path.append(path[0])
221 |
222 | # print("Result path: ", path)
223 | # print("Result length of the path: ", length)
224 |
225 | return G, MSTree_init, odd_vertexes_init, new_added_matching, united_MSTree_perfect_matching, eulerian_tour, length, path
226 |
227 |
228 |
229 | def get_length(x1, y1, x2, y2, name='great_circle'):
230 | '''
231 | x1: lat1
232 | y1: lng1
233 | x2: lat2
234 | y2: lng2
235 | '''
236 | if name == 'great_circle':
237 | return great_circle((x1,y1), (x2,y2)).km
238 | else:
239 | return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** (1 / 2)
240 |
241 |
242 | def build_graph(data):
243 | graph = {}
244 | for this in range(len(data)):
245 | for another_point in range(len(data)):
246 | if this != another_point:
247 | if this not in graph:
248 | graph[this] = {}
249 |
250 | graph[this][another_point] = get_length(data[this][0],
251 | data[this][1],
252 | data[another_point][0],
253 | data[another_point][1],
254 | name='great_circle')
255 |
256 | return graph
257 |
258 |
259 | class UnionFind:
260 | def __init__(self):
261 | self.weights = {}
262 | self.parents = {}
263 |
264 | def __getitem__(self, object):
265 | if object not in self.parents:
266 | self.parents[object] = object
267 | self.weights[object] = 1
268 | return object
269 |
270 | # find path of objects leading to the root
271 | path = [object]
272 | root = self.parents[object]
273 | while root != path[-1]:
274 | path.append(root)
275 | root = self.parents[root]
276 |
277 | # compress the path and return
278 | for ancestor in path:
279 | self.parents[ancestor] = root
280 | return root
281 |
282 | def __iter__(self):
283 | return iter(self.parents)
284 |
285 | def union(self, *objects):
286 | roots = [self[x] for x in objects]
287 | heaviest = max([(self.weights[r], r) for r in roots])[1]
288 | for r in roots:
289 | if r != heaviest:
290 | self.weights[heaviest] += self.weights[r]
291 | self.parents[r] = heaviest
292 |
293 |
294 | def minimum_spanning_tree(G):
295 | tree = []
296 | subtrees = UnionFind()
297 | for W, u, v in sorted((G[u][v], u, v) for u in G for v in G[u]):
298 | if subtrees[u] != subtrees[v]:
299 | tree.append((u, v, W))
300 | subtrees.union(u, v)
301 |
302 | return tree
303 |
304 |
305 | def find_odd_vertexes(MST):
306 | tmp_g = {}
307 | vertexes = []
308 | for edge in MST:
309 | if edge[0] not in tmp_g:
310 | tmp_g[edge[0]] = 0
311 |
312 | if edge[1] not in tmp_g:
313 | tmp_g[edge[1]] = 0
314 |
315 | tmp_g[edge[0]] += 1
316 | tmp_g[edge[1]] += 1
317 |
318 | for vertex in tmp_g:
319 | if tmp_g[vertex] % 2 == 1:
320 | vertexes.append(vertex)
321 |
322 | return vertexes
323 |
324 |
325 | def minimum_weight_matching(MST, G, odd_vert):
326 | import random
327 | random.shuffle(odd_vert)
328 |
329 | new_added = []
330 | while odd_vert:
331 | v = odd_vert.pop()
332 | length = float("inf")
333 | u = 1
334 | closest = 0
335 | for u in odd_vert:
336 | if v != u and G[v][u] < length:
337 | length = G[v][u]
338 | closest = u
339 |
340 | MST.append((v, closest, length))
341 | new_added.append((v, closest, length))
342 | odd_vert.remove(closest)
343 | return new_added
344 |
345 | def find_eulerian_tour(MatchedMSTree, G):
 346 |     # find neighbours
347 | neighbours = {}
348 | for edge in MatchedMSTree:
349 | if edge[0] not in neighbours:
350 | neighbours[edge[0]] = []
351 |
352 | if edge[1] not in neighbours:
353 | neighbours[edge[1]] = []
354 |
355 | neighbours[edge[0]].append(edge[1])
356 | neighbours[edge[1]].append(edge[0])
357 |
358 | # print("Neighbours: ", neighbours)
359 |
 360 |     # construct the eulerian circuit (the hamiltonian shortcut is taken later in christofides)
361 | start_vertex = MatchedMSTree[0][0]
362 | EP = [neighbours[start_vertex][0]]
363 |
364 | while len(MatchedMSTree) > 0:
365 | for i, v in enumerate(EP):
366 | if len(neighbours[v]) > 0:
367 | break
368 |
369 | while len(neighbours[v]) > 0:
370 | w = neighbours[v][0]
371 |
372 | remove_edge_from_matchedMST(MatchedMSTree, v, w)
373 |
374 | del neighbours[v][(neighbours[v].index(w))]
375 | del neighbours[w][(neighbours[w].index(v))]
376 |
377 | i += 1
378 | EP.insert(i, w)
379 |
380 | v = w
381 |
382 | return EP
383 |
384 |
385 | def remove_edge_from_matchedMST(MatchedMST, v1, v2):
386 |
387 | for i, item in enumerate(MatchedMST):
388 | if (item[0] == v2 and item[1] == v1) or (item[0] == v1 and item[1] == v2):
389 | del MatchedMST[i]
390 |
391 | return MatchedMST
392 |
393 |
394 | def Euler_Tour(multigraph):
395 | """ Uses Fleury's algorithm to find the Euler Tour of the MultiGraph.
396 | """
397 | tour = []
398 | temp_graph = nx.MultiGraph()
 399 |     graph_nodes = list(nx.nodes(multigraph))  # NodeView is not indexable; convert to a list
400 | current_node = graph_nodes[0]
401 | tour.append(current_node)
402 | while nx.number_of_edges(multigraph) > 0:
403 | for edge in multigraph.edges(current_node):
 404 |             temp_graph = deepcopy(multigraph)
405 | temp_graph.remove_edge(edge[0], edge[1], key=None)
406 | if nx.is_connected(temp_graph):
407 | tour.append(edge[1])
408 | current_node = edge[1]
409 | multigraph.remove_edge(edge[0], edge[1], key=None)
410 | break
411 | else:
412 | tour.append(edge[1])
413 | current_node = edge[1]
414 | multigraph.remove_edge(edge[0], edge[1], key=None)
 415 |         multigraph.remove_nodes_from(list(nx.isolates(multigraph)))
416 | return tour
417 |
418 |
419 | def shortcut_Euler_Tour(tour):
 420 |     """Finds the shortcut of the Euler Tour to obtain the approximation.
421 | """
422 | Tour = []
423 | for vertex in tour:
424 | if vertex not in Tour:
425 | Tour.append(vertex)
426 | Tour.append(tour[0])
427 | return Tour
428 |
429 |
430 | class TravelingSalesman(Annealer):
431 | """Calculates sequence of places to visit"""
432 | def __init__(self, state, distance_matrix):
433 | self.distance_matrix = distance_matrix
434 | super(TravelingSalesman, self).__init__(state)
435 |
436 | def move(self):
437 | """Swaps two cities in the route."""
438 | a = random.randint(0, len(self.state) - 1)
439 | b = random.randint(0, len(self.state) - 1)
440 | self.state[a], self.state[b] = self.state[b], self.state[a]
441 |
442 | def energy(self):
443 | """Calculates energy with current configuration"""
444 | total_dist = 0
445 | # add distances from i-1 -> i
446 | for i in range(len(self.state)):
447 | # loop, back to the start.
448 | total_dist += self.distance_matrix[self.state[i-1]][self.state[i]]
449 | return total_dist
450 |
451 |
452 | def TravelingSalesmanRun(loc: dict, iteration: int):
453 | output = pd.DataFrame({'id': [], 'iteration': [], 'distance': []})
454 |
455 | # create a distance matrix
456 | distance_matrix = {}
457 | for ka, va in loc.items():
458 | distance_matrix[ka] = {}
459 | for kb, vb in loc.items():
460 | if kb == ka:
461 | distance_matrix[ka][kb] = 0.0
462 | else:
463 | distance_matrix[ka][kb] = great_circle(va, vb).m
 464 |     logging.info('[Done] Create A Distance Matrix')
465 |
466 | for iter_i in range(iteration):
467 | # initial state
468 |
469 | # init_state = sorted(list(loc.keys()))
470 | init_state = list(loc.keys())
471 | random.shuffle(init_state)
472 |
473 | # run
474 | distance_matrix_copy = distance_matrix.copy()
475 |
476 | tsp = TravelingSalesman(init_state, distance_matrix_copy)
477 |
478 | ##################################################
479 | # Tmax = 25000.0 # Max (starting) temperature #
480 | # Tmin = 2.5 # Min (ending) temperature #
481 | # steps = 50000 # Number of iterations #
482 | # updates = 100 #
483 | ##################################################
484 | auto_schedule = tsp.auto(minutes=1)
485 | tsp.set_schedule(auto_schedule)
486 | tsp.copy_strategy = "slice"
487 |
488 | state, e = tsp.anneal()
489 |
 490 |         logging.info(f'[{iter_i+1}]: {e} m route')
491 |
492 | # record
493 | output_i = pd.DataFrame({'id': state, 'iteration': [iter_i]*len(loc), 'distance': [e]*len(loc)})
494 | output = output.append(output_i)
495 | logging.info(f'[Done]: Traveling Salesman Run')
496 | return output
497 |
498 | def result(loc: dict, Output: pd.DataFrame) -> pd.DataFrame:
499 | output_copy = Output.copy()
500 | loc_copy = loc.copy()
501 |
502 | loc_copy = pd.DataFrame.from_dict(loc_copy, orient='index')
503 | loc_copy.columns = ['lng','lat']
504 |
505 | output_copy['shortest'] = output_copy['distance'].rank(method="min")
506 | output_copy['visitOrder'] = output_copy.index + 1
507 | output_copy.index = output_copy['id']
508 | output_copy = output_copy.join(loc_copy)
509 | output_copy['shortest'] = output_copy['shortest'].astype(int)
510 | output_copy = output_copy.sort_values(by=['shortest', 'visitOrder'])
511 | output_copy.reset_index(inplace=True, drop=True)
512 | output_copy['geometry'] = output_copy.apply(lambda x: Point(x.lng, x.lat), axis=1)
513 | logging.info(f'[Done]: Organize Result')
514 | return output_copy
515 |
516 | def shortestRoute(result: pd.DataFrame):
517 | shortest_route = result[result.shortest == 1]
518 | shortest_route = gpd.GeoDataFrame(shortest_route)
519 | logging.info(f'[Done]: Find The Shortest Route')
520 | return shortest_route
521 |
522 |
523 |
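And a matching sketch for the simulated-annealing pipeline above, continuing from the same hypothetical loc dict used in the ACO sketch:

    from utilis import TravelingSalesmanRun, result, shortestRoute

    runs = TravelingSalesmanRun(loc, iteration=5)   # five independent annealing runs from shuffled starts
    ordered = result(loc, runs)                     # rank the runs by total distance and restore coordinates
    best = shortestRoute(ordered)                   # GeoDataFrame with the best route in visiting order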
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM jupyter/datascience-notebook
2 |
3 | USER root
4 | RUN apt-get update
5 | RUN apt-get install build-essential software-properties-common -y
 6 | RUN apt-get install -y pkg-config
7 | RUN jupyter nbextension enable --py widgetsnbextension
8 | RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
9 | RUN apt-get install -y gdal-bin
10 | RUN apt-get install -y libspatialindex-dev
11 |
12 | USER jovyan
13 | COPY . /tmp/
14 | RUN pip install --requirement /tmp/requirements.txt
15 | RUN conda install -y r-rgdal
16 | RUN conda install -y r-spdep
17 | RUN Rscript /tmp/src_import/modules.R
18 |
19 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Becoming a Spatial Data Scientist Materials
2 |
3 | Example notebooks to accompany [Becoming a Spatial Data Scientist](https://go.carto.com/ebooks/spatial-data-science).
4 |
5 | 
6 |
7 |
8 |
9 | ## Installation requirements
10 |
11 | The notebooks in this repository use a ready-to-run Docker image containing Jupyter applications and interactive computing tools. To run the notebooks, please follow the instructions below.
12 |
13 | 1. Clone this repository
14 | ```bash
15 | $ git clone git@github.com:CartoDB/data-science-book.git
16 | $ cd data-science-book
17 | ```
18 |
19 | 2. Download and install Docker. Follow the instructions here: https://docs.docker.com/install/
20 |
21 | 3. Run the image. Open your terminal and run
22 | ```bash
23 | $ docker run --user root -p 8888:8888 -e JUPYTER_ENABLE_LAB=yes -e GRANT_SUDO=yes -v "$PWD":/home/jovyan/workspace cartodb/data-science-book
24 | ```
25 |
26 | A local address will be created. Copy and paste the address into your browser; this will launch Jupyter Lab. **Note**: If you have another Jupyter server running, make sure it is on a port other than 8888. Otherwise, change the port number above or shut down the other notebook server.
27 |
28 | 4. Start experimenting with the code in each of the Chapter directories
29 |
30 |
31 |
32 | ## Table of Contents
33 |
34 | ### Chapter 1 - 2
35 |
36 | - `Visualizing spatial data with CARTOframes` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%201-2/Visualizing%20spatial%20data%20with%20CARTOframes.ipynb)) - a notebook for easily visualizing your data on a map using CARTOframes.
37 |
38 | - `Computing measures of spatial dependence` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%201-2/Computing%20measures%20of%20spatial%20dependence.ipynb)) - a notebook for exploring spatial dependence in your data and visualizing the results using CARTOframes.
39 |
40 | - `Discrete spatial models` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%201-2/Discrete%20Spatial%20Models.ipynb)) - a notebook with examples of spatial models for discrete processes, visualizing the results using CARTOframes.
41 |
42 | - `Continuous spatial models` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%201-2/Continuous%20Spatial%20Models.ipynb)) - a notebook with examples of spatial models for continuous processes, visualizing the results using CARTOframes.
43 |
44 | ### Chapter 3
45 |
46 | - `Agglomerative Clustering` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%203/agglomerative.ipynb)) - a notebook demonstrating how to create spatially constrained clusters using agglomerative clustering
47 | - `DBSCAN` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%203/dbscan.ipynb)) - a notebook demonstrating how to create clusters of points in geographic coordinates
48 | - `SKATER` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%203/skater.ipynb)) - a notebook demonstrating how to create spatially constrained clusters that are homogeneous
49 |
50 | ### Chapter 4
51 |
52 | - `Travelling Salesman Problem` ([static preview](https://nbviewer.jupyter.org/github/CartoDB/data-science-book/blob/master/Chapter%204/Travelling%20Salesman%20Problem.ipynb)) - a notebook demonstrating how to solve the travelling salesman problem.
53 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | geopandas
2 | cartoframes==v1.0.0
3 | shapely
4 | scikit-gstat
5 | scikit-learn
6 | libpysal
7 | esda
8 | pointpats
9 | splot
10 | rpy2
11 | rtree
12 | netCDF4
13 | git+https://github.com/pysal/region.git#egg=region
14 | geopy
15 | simanneal
16 |
--------------------------------------------------------------------------------
/src_import/modules.R:
--------------------------------------------------------------------------------
1 | libraries <- c('devtools',
2 | 'sp',
3 | 'spdep',
4 | 'rgdal',
5 | 'raster',
6 | 'maptools',
7 | 'gstat',
8 | 'reshape2',
9 | 'magrittr',
10 | 'dplyr',
11 | 'mgcv',
12 | 'splancs',
13 | 'INLA')
14 | CheckInstallPackages <- function(pkgs){
15 |
16 | #For each pkg in pkgs (attempt to load each package one at a time):
17 |
18 | x <- lapply(pkgs, function(pkg){
19 |
20 | #Load the package if available,
21 |
22 | if(!do.call("require", list(pkg))) {
23 | #Silently attempt to install into the default library
24 |
25 | try(install.packages(pkg, lib=.Library,repos="http://cran.rstudio.com"))
26 |
27 | #Now attempt to load the package, catch error if it wasn't installed
28 |
29 | tryCatch(do.call("library", list(pkg)),
30 |
31 | #Catch if we're unable to install into the default library
32 |
33 | error = function(err) {
34 |
35 | #If non-interactive, install into this user's personal library
36 |
37 | if(!interactive()) {
38 |
39 | #Get the path to this user's personal library
40 |
41 | personalLibPath <- Sys.getenv("R_LIBS_USER")
42 |
43 | #If the personal library is not in the list of libraries
44 |
45 | if(is.na(match(personalLibPath, .libPaths()))) {
46 |
47 | #Then create the personal library
48 |
49 | dir.create(personalLibPath, recursive = TRUE)
50 | #And add the personal library to the list of libraries
51 |
52 | .libPaths(personalLibPath)
53 |
54 | }
55 |
56 | #Attempt to install the package into the personal library
57 |
58 | #If this fails, raise the error back to the report
59 |
60 | if(pkg=='INLA'){
61 | install.packages('INLA', repos="https://inla.r-inla-download.org/R/stable", dep=TRUE)
62 | }
63 | if(pkg=='INLAutils'){
64 | install_github('timcdlucas/INLAutils', dep = TRUE)
65 | }
66 |
67 | install.packages(pkg, lib=personalLibPath, repos="http://cran.rstudio.com")
68 |
69 | #Finally, attempt to load the package
70 |
71 | do.call("library", list(pkg))
72 |
73 | }})}})
74 |
75 | }
76 |
77 | CheckInstallPackages(libraries)
--------------------------------------------------------------------------------
/src_import/utils.R:
--------------------------------------------------------------------------------
1 | loadMEUSE <- function(){
2 |
3 | data(meuse)
4 | coordinates(meuse) <- ~x+y
5 | proj4string(meuse) <- CRS("+proj=sterea +lat_0=52.15616055555555 +lon_0=5.38763888888889 +k=0.9999079 +x_0=155000 +y_0=463000 +ellps=bessel +towgs84=565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812 +units=m +no_defs")
6 |
7 | data(meuse.grid)
8 | coordinates(meuse.grid) = ~x+y
9 | proj4string(meuse.grid) <- CRS("+proj=sterea +lat_0=52.15616055555555 +lon_0=5.38763888888889 +k=0.9999079 +x_0=155000 +y_0=463000 +ellps=bessel +towgs84=565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812 +units=m +no_defs")
10 | gridded(meuse.grid) = TRUE
11 |
12 | meuse_list <- list(meuse,meuse.grid)
13 | names(meuse_list) <- c('meuse','meuse.grid')
14 |
15 | return(meuse_list)
16 | }
17 |
18 | pseudoR2 <- function(y_obs, y_pred){
19 | res <- y_obs-y_pred
20 | tmp <- 1-sum(res^2)/sum((y_obs-mean(y_obs))^2)
21 | return(tmp)
22 | }
23 |
24 | ## Code from gstat
25 | gstatkrg <- function(data, data.grid, formula, filename.out,filename.grid.out, var_model = "Sph"){
26 |
27 | # Compute the empirical variogram and fit model to the residuals
28 | fit <- lm(formula, data)
29 | vg <- variogram(formula, data)
30 | fit.vg <- fit.variogram(vg, vgm(var_model))
31 | data$res <- fit$res
32 | write.table(data, filename.out, sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)
33 |
34 | # (Universal) Kriging
35 | krg <- krige(formula, data, data.grid, model = fit.vg)
36 | data.grid$mean <- krg$var1.pred
37 | data.grid$sd <- sqrt(krg$var1.var)
38 | write.table(data.grid, filename.grid.out, sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)
39 | }
40 |
41 | get_INLAspde_results <- function(data, data.grid, mesh, model, stack,response_var, filename.out, filename.grid.out, sp_res = c(300,300)){
42 |
43 | ## Predictions
44 | data_out <- data.frame(data,model$summary.fitted.values[inla.stack.index(stack, tag = "train")$data, 1:5])
45 | data_out$pr2 <- pseudoR2(data_out[[response_var]],data_out$mean)
46 | data.grid_out <- data.frame(data.grid,model$summary.fitted.values[inla.stack.index(stack, tag = "pred")$data, 1:5])
47 |
48 | ## Spatial latent field
49 | proj = inla.mesh.projector(mesh, dims = sp_res)
50 | spatial_mean = inla.mesh.project(proj, model$summary.random[['spatial.field']][['mean']], dims = sp_res)
51 | spatial_sd = inla.mesh.project(proj, model$summary.random[['spatial.field']][['sd']], dims = sp_res)
52 | spatial_0.025quant = inla.mesh.project(proj, model$summary.random[['spatial.field']][['0.025quant']], dims = sp_res)
53 | spatial_0.5quant = inla.mesh.project(proj, model$summary.random[['spatial.field']][['0.5quant']], dims = sp_res)
54 | spatial_0.975quant = inla.mesh.project(proj, model$summary.random[['spatial.field']][['0.975quant']], dims = sp_res)
55 | spatial_mode = inla.mesh.project(proj, model$summary.random[['spatial.field']][['mode']], dims = sp_res)
56 |
57 | sp_out <- as.data.frame(cbind(x=proj$lattice$loc[,1],y=proj$lattice$loc[,2], spatial_mean = melt(spatial_mean)$value, spatial_SD = melt(spatial_sd)$value,
58 | spatial_0.025quant =melt(spatial_0.025quant)$value,
59 | spatial_0.5quant =melt(spatial_0.5quant)$value,
60 | spatial_0.975quant =melt(spatial_0.975quant)$value,
61 | spatial_mode =melt(spatial_mode)$value))
62 | colnames(sp_out)[colnames(sp_out)=="spatial_SD"] <- "spatial_sd"
63 |
64 | write.table(data_out,filename.out, sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)
65 | write.table(data.grid_out,filename.grid.out, sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)
66 | write.table(sp_out,gsub('.csv','_sp.csv',filename.out), sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)
67 |
68 | }
69 |
70 | ## Code from https://becarioprecario.bitbucket.io
71 | INLAspde <- function(data, data.grid, family, response_var, predictors, data.crs,filename.out, filename.grid.out){
72 |
73 | #Define mesh
74 | bnd <- inla.nonconvex.hull(coordinates(data.grid),crs=data.crs)
75 | mesh <- inla.mesh.2d(loc = coordinates(data.grid), boundary = bnd, cutoff = 100, max.edge = c(250, 500), offset = c(100, 250))
76 |
77 | png(gsub('.csv','_mesh.png',filename.out), width = 20, height = 20, units = 'cm', res = 300)
78 | plot(mesh, asp = 1, main = "")
79 | points(coordinates(data), pch = 21, bg = 'red', col = 'red', cex = 1)
80 | dev.off()
81 |
82 | #Create SPDE
83 | #sig0 = 0.1; rho0 = 0.1 #rho0 is typical range, sig0 typical sd
84 | ## (ρ_0, P(ρ < ρ_0)=p_ρ) where ρ is the spatial range of the random field.
85 | ## (σ_0, P(σ > σ_0)=p_σ) where σ is the marginal standard deviation of the field)
86 | spde <- inla.spde2.pcmatern(mesh = mesh, alpha = 2, constr=TRUE, prior.range=c(700,0.1), prior.sigma=c(0.2,0.1))
87 | s.index <- inla.spde.make.index(name = "spatial.field",n.spde = spde$n.spde)
88 |
89 | #Create data structure
90 | A.train <- inla.spde.make.A(mesh = mesh, loc = coordinates(data))
91 | stack.train <- inla.stack(data = list(response_var = data[[response_var]]),
92 | A = list(A.train, 1),
93 | effects = list(c(s.index, list(Intercept = 1)),
94 | data.frame(data) %>%
95 | select(predictors) %>%
96 | as.list()),
97 | tag = "train")
98 |
99 | #Create data structure for prediction
100 | A.pred <- inla.spde.make.A(mesh = mesh, loc = coordinates(data.grid))
101 | stack.pred <- inla.stack(data = list(response_var = NA),
102 | A = list(A.pred, 1),
103 | effects = list(c(s.index, list(Intercept = 1)),
104 | data.frame(data.grid) %>%
105 | select(predictors) %>%
106 | as.list()),
107 | tag = "pred")
108 |
109 | #Join stack
110 | stack.join <- inla.stack(stack.train, stack.pred)
111 |
112 | #Fit model
113 | ff <- as.formula(paste('response_var', paste(c('-1', 'Intercept', predictors, 'f(spatial.field, model = spde)'), collapse=" + "),sep='~'))
114 | model <- inla(ff, data = inla.stack.data(stack.join, spde = spde),
115 | family = family,
116 | control.predictor = list(A = inla.stack.A(stack.join), compute = TRUE,link = 1),
117 | control.compute = list(cpo = TRUE, dic = TRUE), verbose = TRUE)
118 |
119 | #Summary of results
120 | print(summary(model))
121 |
122 | get_INLAspde_results(data, data.grid, mesh, model, stack.join,response_var, filename.out, filename.grid.out)
123 |
124 | }
125 |
--------------------------------------------------------------------------------