├── .gitignore ├── LICENSE ├── README.md └── public ├── ingest.ipynb ├── noaa.ipynb └── overture.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Fused 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Fused Example Jupyter Notebooks 3 |

4 |

5 | 🌎 Code to Map. Instantly. 6 |

7 |

8 | 9 | ![alt text](https://fused-magic.s3.us-west-2.amazonaws.com/docs_assets/github_udfs_repo/readme_udf_explorer.png) 10 | 11 | This repo is a public collection of Jupyter Notebooks that showcase Fused User Defined Functions (UDFs). 12 | 13 | Fused is the glue layer that interfaces data platforms and data tools via a managed serverless API. With Fused, you can write, share, or discover UDFs which are the building blocks of serverless geospatial operations. UDFs are Python functions that turn into live HTTP endpoints that load their output into any tools that can call an API. 14 | 15 | ## Quickstart 16 | 17 | ### 1. Install Fused Python SDK 18 | 19 | [![PyPI Version](https://img.shields.io/pypi/v/fused.svg)](https://pypi.python.org/pypi/fused) 20 | 21 | The Fused Python SDK is available at [PyPI](https://pypi.org/project/fused/). Use the standard Python [installation tools](https://packaging.python.org/en/latest/tutorials/installing-packages/). UDFs in this repo expect the most recent version. 22 | 23 | ```bash 24 | python3 -m venv .venv 25 | source .venv/bin/activate 26 | pip install fused 27 | ``` 28 | ### 2. TODO 29 | 30 | ## Walkthrough TODO 31 | 32 | 33 | ## Ecosystem 34 | 35 | Build any scale workflows with the [Fused Python SDK](https://docs.fused.io/python-sdk/overview) and [Workbench webapp](https://docs.fused.io/workbench/overview), and integrate them into your stack with the [Fused Hosted API](https://docs.fused.io/hosted-api/overview). 36 | 37 | ![alt text](https://fused-magic.s3.us-west-2.amazonaws.com/docs_assets/ecosystem_diagram.png) 38 | 39 | ## Documentation 40 | 41 | Fused documentation is in [docs.fused.io](https://docs.fused.io/). 42 | 43 | ## Contribution guidelines 44 | 45 | All UDF contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. 
46 | 47 | ## License 48 | 49 | [MIT License](./LICENSE) 50 | 51 | -------------------------------------------------------------------------------- /public/ingest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": { 7 | "id": "UhnqNrO6UhHj" 8 | }, 9 | "source": [ 10 | "# How-to: Ingestion\n", 11 | "\n", 12 | "This example notebook shows how to use Fused to ingest data into an S3 bucket." 13 | ] 14 | }, 15 | { 16 | "attachments": {}, 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "cN6FpPH4WZms" 20 | }, 21 | "source": [ 22 | "## Ingest data\n", 23 | "\n", 24 | "Fused delivers speed advantages thanks to spatial partitioning. Geospatial operations between two or more datasets are usually for spatially overlapping or neighboring areas - and usually for localized areas of interest. Breaking down datasets across geographic chunks loads only the relevant data for each operation.\n", 25 | "\n", 26 | "The [`fused.ingest()`](/python-sdk/api/top-level-functions/#fused.ingest) method loads data into an S3 bucket and automatically geo partitions it.\n", 27 | "\n", 28 | "Datasets ingested with Fused are spatially partitioned collections of Parquet files. Each file has one or more chunks, which are a further level of spatial partitioning.\n", 29 | "\n", 30 | "Columns in a dataset are grouped into tables. An ingested dataset contains a `main` table with the original input columns and a `fused` table containing spatial metadata.\n", 31 | "\n", 32 | "The `ingest()` method has many configuration options, which the API documentation explains. The following sections cover a few different ingestion use cases.\n", 33 | "\n", 34 | "Pro tip: While Fused is generally used to ingest files, it's also possible to pass the `GeoDataFrame` directly to `fused.ingest()`." 
35 | ] 36 | }, 37 | { 38 | "attachments": {}, 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Default ingestion\n", 43 | "By default ingestion tries to create a certain number of files (`target_num_files=20`). The number of rows per file and chunk are chosen to meet this target. Note that 20 files is only a target and the actual number of files generated can vary." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 6, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import fused\n", 53 | "\n", 54 | "\n", 55 | "job = fused.ingest(\n", 56 | " input=\"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip\",\n", 57 | " output=f\"fd://census/dc_tract\",\n", 58 | ")\n", 59 | "job_id = job.run_remote(overwrite=True)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "While the job is running, follow its logs." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "job_id.tail_logs()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "### Row-based ingestion\n", 83 | "\n", 84 | "Our basic ingestion is row-based, where the user sets the maximum number of rows for each chunk and file." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 8, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "job = fused.ingest(\n", 94 | " input=\"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip\",\n", 95 | " explode_geometries=True,\n", 96 | " partitioning_method=\"rows\",\n", 97 | " partitioning_maximum_per_file=100,\n", 98 | " partitioning_maximum_per_chunk=10,\n", 99 | ")\n", 100 | "job_id = job.run_remote(overwrite=True)" 101 | ] 102 | }, 103 | { 104 | "attachments": {}, 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Area-based ingestion\n", 109 | "\n", 110 | "Fused also supports area-based ingestion, where the number of rows in each partition is determined by the sum of their area.\n" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 10, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "job = fused.ingest(\n", 120 | " input=\"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip\",\n", 121 | " output=f\"fd://census/dc_tract_area\",\n", 122 | " explode_geometries=True,\n", 123 | " partitioning_method=\"area\",\n", 124 | " partitioning_maximum_per_file=None,\n", 125 | " partitioning_maximum_per_chunk=None,\n", 126 | ")\n", 127 | "job_id = job.run_remote(overwrite=True)" 128 | ] 129 | }, 130 | { 131 | "attachments": {}, 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Geometry subdivision\n", 136 | "\n", 137 | "It's also possible to subdivide geometries in the ingestion process." 
138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 11, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "job = fused.ingest(\n", 147 | " input=\"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip\",\n", 148 | " output=f\"fd://census/dc_tract_geometry\",\n", 149 | " explode_geometries=True,\n", 150 | " partitioning_method=\"area\",\n", 151 | " partitioning_maximum_per_file=None,\n", 152 | " partitioning_maximum_per_chunk=None,\n", 153 | " subdivide_start=0.001,\n", 154 | " subdivide_stop=0.0001,\n", 155 | " subdivide_method=\"area\",\n", 156 | ")\n", 157 | "job_id = job.run_remote(overwrite=True)" 158 | ] 159 | }, 160 | { 161 | "attachments": {}, 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Once ingestion completes, [`fused.open_table`](/python-sdk/api/experimental/#fused._experimental.open_table) returns the corresponding [`Table`](/python-sdk/api/experimental/#fused.models.Table) object.\n", 166 | "\n", 167 | "The notebook _repr_ provides insight into the Table structure.\n", 168 | "\n", 169 | "- Each table has one or more _files_, which are spatially partitioned.\n", 170 | "- Each file has one or more _chunks_, which are again spatially partitioned within the file.\n", 171 | "\n", 172 | "Optionally, tables can be part of a `Dataset`, which consists of one or more _tables_.\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 12, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/html": [ 183 | "
\n", 184 | "\n", 185 | "\n", 186 | "\n", 187 | "\n", 188 | "\n", 189 | "\n", 190 | "\n", 191 | "\n", 192 | "\n", 193 | "\n", 194 | "\n", 195 | "\n", 196 | "\n", 197 | "\n", 198 | "\n", 199 | "\n", 200 | "\n", 201 | "\n", 202 | "\n", 203 | "\n", 204 | "
Table(version='0.0.3', url='fd://census/dc_tract', name='dc_tract', table_schema=Schema(fields=[Field(name='STATEFP', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='COUNTYFP', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='TRACTCE', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='GEOID', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='NAME', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='NAMELSAD', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='MTFCC', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='FUNCSTAT', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='ALAND', type=<PrimitiveDataType.Int64: 'Int64'>, nullable=True, metadata=None), Field(name='AWATER', type=<PrimitiveDataType.Int64: 'Int64'>, nullable=True, metadata=None), Field(name='INTPTLAT', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='INTPTLON', type=<PrimitiveDataType.String: 'String'>, nullable=True, metadata=None), Field(name='geometry', type=<PrimitiveDataType.Binary: 'Binary'>, nullable=True, metadata={'ARROW:extension:name': 'ogc.wkb'})], metadata={'geo': '{"primary_column": "geometry", "columns": {"geometry": {"encoding": "WKB", "crs": {"$schema": "https://proj.org/schemas/v0.7/projjson.schema.json", "type": "GeographicCRS", "name": "WGS 84", "datum_ensemble": {"name": "World Geodetic System 1984 ensemble", "members": [{"name": "World Geodetic System 1984 (Transit)", "id": {"authority": "EPSG", "code": 1166}}, {"name": "World Geodetic System 1984 (G730)", "id": {"authority": "EPSG", "code": 1152}}, {"name": "World Geodetic System 1984 (G873)", "id": {"authority": "EPSG", "code": 1153}}, {"name": "World Geodetic System 1984 (G1150)", "id": {"authority": 
"EPSG", "code": 1154}}, {"name": "World Geodetic System 1984 (G1674)", "id": {"authority": "EPSG", "code": 1155}}, {"name": "World Geodetic System 1984 (G1762)", "id": {"authority": "EPSG", "code": 1156}}, {"name": "World Geodetic System 1984 (G2139)", "id": {"authority": "EPSG", "code": 1309}}], "ellipsoid": {"name": "WGS 84", "semi_major_axis": 6378137, "inverse_flattening": 298.257223563}, "accuracy": "2.0", "id": {"authority": "EPSG", "code": 6326}}, "coordinate_system": {"subtype": "ellipsoidal", "axis": [{"name": "Geodetic latitude", "abbreviation": "Lat", "direction": "north", "unit": "degree"}, {"name": "Geodetic longitude", "abbreviation": "Lon", "direction": "east", "unit": "degree"}]}, "scope": "Horizontal component of 3D system.", "area": "World.", "bbox": {"south_latitude": -90, "west_longitude": -180, "north_latitude": 90, "east_longitude": 180}, "id": {"authority": "EPSG", "code": 4326}}, "geometry_types": ["WKB"], "bbox": [-77.119759, 38.791644999999995, -76.909393, 38.995844999999996]}}, "version": "1.0.0-beta.1"}', 'fused:job_meta': '{"instance_settings": {"ec2_instance_type": "m5.16xlarge", "hdd_size_gb": 100}, "revision": "8846eedbc8a0b6a9530fd03208934d2b2b7c1b8c", "job_config": {"version": "0.0.3", "name": null, "steps": [{"version": "0.0.3", "type": "partition_geospatial", "name": null, "metadata": null, "input": "https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip", "output": null, "output": "fd://census/dc_tract", "output_metadata": null, "partitioning_maximum_per_file": null, "partitioning_maximum_per_chunk": null, "table_schema": null, "file_suffix": null, "load_columns": null, "remove_cols": [], "explode_geometries": false, "drop_out_of_bounds": null, "lonlat_cols": null, "partitioning_max_width_ratio": 2, "partitioning_max_height_ratio": 2, "partitioning_method": "rows", "partitioning_force_utm": "chunk", "partitioning_split_method": "mean", "subdivide_start": null, "subdivide_stop": null, "subdivide_method": 
null, "split_identical_centroids": true, "target_num_files": 20, "gdal_config": {"open_options": {}, "layer": null}}], "metadata": null}, "step_config": {"version": "0.0.3", "type": "partition_geospatial", "name": null, "metadata": null, "input": "https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip", "output": null, "output": "fd://census/dc_tract", "output_metadata": null, "partitioning_maximum_per_file": null, "partitioning_maximum_per_chunk": null, "table_schema": null, "file_suffix": null, "load_columns": null, "remove_cols": [], "explode_geometries": false, "drop_out_of_bounds": null, "lonlat_cols": null, "partitioning_max_width_ratio": 2, "partitioning_max_height_ratio": 2, "partitioning_method": "rows", "partitioning_force_utm": "chunk", "partitioning_split_method": "mean", "subdivide_start": null, "subdivide_stop": null, "subdivide_method": null, "split_identical_centroids": true, "target_num_files": 20, "gdal_config": {"open_options": {}, "layer": null}}, "input": null, "time_taken": null, "production": true, "usage": null, "job_id": "720f0b78-6d1c-45e8-9867-85b0cf9ebfb6", "has_ifused": null, "has_sample": null, "has_metadata": null, "num_rows": null, "num_files": null, "num_chunks": null}'}), parent=JobMetadata(version='0.0.3', ec2_instance_type=None, step_config=GeospatialPartitionJobStepConfig(version='0.0.3', type='partition_geospatial', name=None, metadata=None, input='https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip', output='fd://census/dc_tract', output_metadata=None, partitioning_maximum_per_file=None, partitioning_maximum_per_chunk=None, table_schema=None, file_suffix=None, load_columns=None, remove_cols=[], explode_geometries=False, drop_out_of_bounds=None, lonlat_cols=None, partitioning_max_width_ratio=2, partitioning_max_height_ratio=2, partitioning_method='rows', partitioning_force_utm='chunk', partitioning_split_method='mean', subdivide_start=None, subdivide_stop=None, 
subdivide_method=None, split_identical_centroids=True, target_num_files=20, gdal_config=GDALOpenConfig(open_options={}, layer=None)), time_taken=None, job_id='720f0b78-6d1c-45e8-9867-85b0cf9ebfb6'), column_names=['STATEFP', 'COUNTYFP', 'TRACTCE', 'GEOID', 'NAME', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry'], num_rows=206, num_files=28, num_chunks=150, status=None)
" 612 | ], 613 | "text/plain": [ 614 | "Table(version='0.0.3', url='fd://census/dc_tract', name='dc_tract', table_schema=Schema(fields=[Field(name='STATEFP', type=, nullable=True, metadata=None), Field(name='COUNTYFP', type=, nullable=True, metadata=None), Field(name='TRACTCE', type=, nullable=True, metadata=None), Field(name='GEOID', type=, nullable=True, metadata=None), Field(name='NAME', type=, nullable=True, metadata=None), Field(name='NAMELSAD', type=, nullable=True, metadata=None), Field(name='MTFCC', type=, nullable=True, metadata=None), Field(name='FUNCSTAT', type=, nullable=True, metadata=None), Field(name='ALAND', type=, nullable=True, metadata=None), Field(name='AWATER', type=, nullable=True, metadata=None), Field(name='INTPTLAT', type=, nullable=True, metadata=None), Field(name='INTPTLON', type=, nullable=True, metadata=None), Field(name='geometry', type=, nullable=True, metadata={'ARROW:extension:name': 'ogc.wkb'})], metadata={'geo': '{\"primary_column\": \"geometry\", \"columns\": {\"geometry\": {\"encoding\": \"WKB\", \"crs\": {\"$schema\": \"https://proj.org/schemas/v0.7/projjson.schema.json\", \"type\": \"GeographicCRS\", \"name\": \"WGS 84\", \"datum_ensemble\": {\"name\": \"World Geodetic System 1984 ensemble\", \"members\": [{\"name\": \"World Geodetic System 1984 (Transit)\", \"id\": {\"authority\": \"EPSG\", \"code\": 1166}}, {\"name\": \"World Geodetic System 1984 (G730)\", \"id\": {\"authority\": \"EPSG\", \"code\": 1152}}, {\"name\": \"World Geodetic System 1984 (G873)\", \"id\": {\"authority\": \"EPSG\", \"code\": 1153}}, {\"name\": \"World Geodetic System 1984 (G1150)\", \"id\": {\"authority\": \"EPSG\", \"code\": 1154}}, {\"name\": \"World Geodetic System 1984 (G1674)\", \"id\": {\"authority\": \"EPSG\", \"code\": 1155}}, {\"name\": \"World Geodetic System 1984 (G1762)\", \"id\": {\"authority\": \"EPSG\", \"code\": 1156}}, {\"name\": \"World Geodetic System 1984 (G2139)\", \"id\": {\"authority\": \"EPSG\", \"code\": 1309}}], 
\"ellipsoid\": {\"name\": \"WGS 84\", \"semi_major_axis\": 6378137, \"inverse_flattening\": 298.257223563}, \"accuracy\": \"2.0\", \"id\": {\"authority\": \"EPSG\", \"code\": 6326}}, \"coordinate_system\": {\"subtype\": \"ellipsoidal\", \"axis\": [{\"name\": \"Geodetic latitude\", \"abbreviation\": \"Lat\", \"direction\": \"north\", \"unit\": \"degree\"}, {\"name\": \"Geodetic longitude\", \"abbreviation\": \"Lon\", \"direction\": \"east\", \"unit\": \"degree\"}]}, \"scope\": \"Horizontal component of 3D system.\", \"area\": \"World.\", \"bbox\": {\"south_latitude\": -90, \"west_longitude\": -180, \"north_latitude\": 90, \"east_longitude\": 180}, \"id\": {\"authority\": \"EPSG\", \"code\": 4326}}, \"geometry_types\": [\"WKB\"], \"bbox\": [-77.119759, 38.791644999999995, -76.909393, 38.995844999999996]}}, \"version\": \"1.0.0-beta.1\"}', 'fused:job_meta': '{\"instance_settings\": {\"ec2_instance_type\": \"m5.16xlarge\", \"hdd_size_gb\": 100}, \"revision\": \"8846eedbc8a0b6a9530fd03208934d2b2b7c1b8c\", \"job_config\": {\"version\": \"0.0.3\", \"name\": null, \"steps\": [{\"version\": \"0.0.3\", \"type\": \"partition_geospatial\", \"name\": null, \"metadata\": null, \"input\": \"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip\", \"output\": null, \"output\": \"fd://census/dc_tract\", \"output_metadata\": null, \"partitioning_maximum_per_file\": null, \"partitioning_maximum_per_chunk\": null, \"table_schema\": null, \"file_suffix\": null, \"load_columns\": null, \"remove_cols\": [], \"explode_geometries\": false, \"drop_out_of_bounds\": null, \"lonlat_cols\": null, \"partitioning_max_width_ratio\": 2, \"partitioning_max_height_ratio\": 2, \"partitioning_method\": \"rows\", \"partitioning_force_utm\": \"chunk\", \"partitioning_split_method\": \"mean\", \"subdivide_start\": null, \"subdivide_stop\": null, \"subdivide_method\": null, \"split_identical_centroids\": true, \"target_num_files\": 20, \"gdal_config\": {\"open_options\": {}, 
\"layer\": null}}], \"metadata\": null}, \"step_config\": {\"version\": \"0.0.3\", \"type\": \"partition_geospatial\", \"name\": null, \"metadata\": null, \"input\": \"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip\", \"output\": null, \"output\": \"fd://census/dc_tract\", \"output_metadata\": null, \"partitioning_maximum_per_file\": null, \"partitioning_maximum_per_chunk\": null, \"table_schema\": null, \"file_suffix\": null, \"load_columns\": null, \"remove_cols\": [], \"explode_geometries\": false, \"drop_out_of_bounds\": null, \"lonlat_cols\": null, \"partitioning_max_width_ratio\": 2, \"partitioning_max_height_ratio\": 2, \"partitioning_method\": \"rows\", \"partitioning_force_utm\": \"chunk\", \"partitioning_split_method\": \"mean\", \"subdivide_start\": null, \"subdivide_stop\": null, \"subdivide_method\": null, \"split_identical_centroids\": true, \"target_num_files\": 20, \"gdal_config\": {\"open_options\": {}, \"layer\": null}}, \"input\": null, \"time_taken\": null, \"production\": true, \"usage\": null, \"job_id\": \"720f0b78-6d1c-45e8-9867-85b0cf9ebfb6\", \"has_ifused\": null, \"has_sample\": null, \"has_metadata\": null, \"num_rows\": null, \"num_files\": null, \"num_chunks\": null}'}), parent=JobMetadata(version='0.0.3', ec2_instance_type=None, step_config=GeospatialPartitionJobStepConfig(version='0.0.3', type='partition_geospatial', name=None, metadata=None, input='https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/TRACT/tl_rd22_11_tract.zip', output='fd://census/dc_tract', output_metadata=None, partitioning_maximum_per_file=None, partitioning_maximum_per_chunk=None, table_schema=None, file_suffix=None, load_columns=None, remove_cols=[], explode_geometries=False, drop_out_of_bounds=None, lonlat_cols=None, partitioning_max_width_ratio=2, partitioning_max_height_ratio=2, partitioning_method='rows', partitioning_force_utm='chunk', partitioning_split_method='mean', subdivide_start=None, subdivide_stop=None, 
subdivide_method=None, split_identical_centroids=True, target_num_files=20, gdal_config=GDALOpenConfig(open_options={}, layer=None)), time_taken=None, job_id='720f0b78-6d1c-45e8-9867-85b0cf9ebfb6'), column_names=['STATEFP', 'COUNTYFP', 'TRACTCE', 'GEOID', 'NAME', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry'], num_rows=206, num_files=28, num_chunks=150, status=None)" 615 | ] 616 | }, 617 | "execution_count": 12, 618 | "metadata": {}, 619 | "output_type": "execute_result" 620 | } 621 | ], 622 | "source": [ 623 | "census_tracts = fused.open_table(f\"fd://census/dc_tract\")\n", 624 | "census_tracts" 625 | ] 626 | } 627 | ], 628 | "metadata": { 629 | "colab": { 630 | "provenance": [] 631 | }, 632 | "kernelspec": { 633 | "display_name": "Python 3", 634 | "name": "python3" 635 | }, 636 | "language_info": { 637 | "codemirror_mode": { 638 | "name": "ipython", 639 | "version": 3 640 | }, 641 | "file_extension": ".py", 642 | "mimetype": "text/x-python", 643 | "name": "python", 644 | "nbconvert_exporter": "python", 645 | "pygments_lexer": "ipython3", 646 | "version": "3.11.5" 647 | } 648 | }, 649 | "nbformat": 4, 650 | "nbformat_minor": 0 651 | } 652 | -------------------------------------------------------------------------------- /public/overture.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "Z5K7s21O1_tQ" 7 | }, 8 | "source": [ 9 | "# Spatial query of the Overture Buildings dataset with Fused\n", 10 | "\n", 11 | "\n", 12 | "Welcome! This Notebook designed to perform a spatial query on the Overture Buildings dataset.\n", 13 | "\n", 14 | "The original dataset is several hundred GB in size. The Fused User Defined Function (UDF) below fetches only the buildings that fall within the area of interest. 
This means you can use this code to simplify your workflows by loading only the fraction of data you care about.\n", 15 | "\n", 16 | "This User Defined Function (UDF) queries a geo partitioned version of the Overture Buildings Dataset, [hosted on Source Cooperative](https://beta.source.coop/repositories/fused/overture/), that corresponds to the area of an input GeoDataFrame. It returns a GeoDataFrame containing the subsampled data, which gets cached to the local environment for added speed. You can find the UDF code in the [public UDF GitHub repo](https://github.com/fusedio/udfs/tree/main/public/Overture_Maps_Example).\n", 17 | "\n", 18 | "Let's begin!" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "id": "ejugfC0D1_tR" 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "!pip install fused==1.4.0 pyarrow==12.0.1 fsspec==2023.12.2" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "id": "OCFhvdrUnftq" 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# Enable ipyleaflet widgets on Colab, which require a specific ipywidgets version\n", 41 | "!pip install ipywidgets==7.7.1\n", 42 | "\n", 43 | "from google.colab import output\n", 44 | "output.enable_custom_widget_manager()\n", 45 | "\n", 46 | "import ipyleaflet\n", 47 | "import ipywidgets\n", 48 | "\n", 49 | "print(ipyleaflet.__version__) # Must be 0.18.2\n", 50 | "print(ipywidgets.__version__) # Must be 7.7.1" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "id": "cnYF0Lr41_tR" 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "import geopandas as gpd\n", 62 | "from shapely.geometry import shape\n", 63 | "import fused\n", 64 | "\n", 65 | "\n", 66 | "# Create a map centered at the given location\n", 67 | "MAP_LOCATION = (37.7749, -122.4194)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "id": 
"PwpST7i81_tS" 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "from ipyleaflet import DrawControl, GeoData, Map, basemap_to_tiles, basemaps\n", 79 | "\n", 80 | "m = Map(\n", 81 | " layers=(basemap_to_tiles(basemaps.OpenStreetMap.Mapnik),),\n", 82 | " center=MAP_LOCATION,\n", 83 | " zoom=13,\n", 84 | ")\n", 85 | "\n", 86 | "def on_draw(self, action, geo_json):\n", 87 | " # Utility to clear map\n", 88 | " if action == 'deleted':\n", 89 | " existing_layers = [l for l in m.layers if isinstance(l, GeoData)]\n", 90 | " if existing_layers:\n", 91 | " print('Clearing')\n", 92 | " for l in existing_layers:\n", 93 | " m.remove(l)\n", 94 | " return\n", 95 | " print(\"Running...\")\n", 96 | "\n", 97 | " # GeoDataFrame from drawn shape\n", 98 | " gdf = gpd.GeoDataFrame(index=[0], crs=\"epsg:4326\", geometry=[shape(geo_json.get(\"geometry\"))])\n", 99 | "\n", 100 | " # Run public UDF, identified by a signed token\n", 101 | " gdf_buildings = fused.core.run_shared_file(token='2d3dac3bd2ee5318a0d887929193e83917a9a24ba99a3affae54132437ac79fb', bbox=gdf.to_json())\n", 102 | "\n", 103 | " # For running locally:\n", 104 | " # udf = fused.load(\n", 105 | " # \"https://github.com/fusedio/udfs/tree/fc5b446/public/Overture_Maps_Example\"\n", 106 | " # )\n", 107 | " # gdf_buildings = udf.run_local(bbox=gdf)\n", 108 | "\n", 109 | " # Render polygons on map\n", 110 | " m.add(GeoData(geo_dataframe=gdf_buildings))\n", 111 | " print(\"Done!\")\n", 112 | "\n", 113 | "draw_control = DrawControl()\n", 114 | "draw_control.on_draw(on_draw)\n", 115 | "draw_control.polygon = {\n", 116 | " \"shapeOptions\": {\"color\": \"red\", \"fillOpacity\": 0.0},\n", 117 | " \"drawError\": {\"color\": \"#dd253b\", \"message\": \"Oups!\"},\n", 118 | " \"allowIntersection\": False,\n", 119 | "}\n", 120 | "\n", 121 | "draw_control.rectangle = {\"shapeOptions\": {\"color\": \"red\", \"fillOpacity\": 0.0}}\n", 122 | "m.add(draw_control)\n", 123 | "m" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "source": 
[], 129 | "metadata": { 130 | "id": "Hvwk3qNctvUL" 131 | }, 132 | "execution_count": null, 133 | "outputs": [] 134 | } 135 | ], 136 | "metadata": { 137 | "colab": { 138 | "provenance": [] 139 | }, 140 | "kernelspec": { 141 | "display_name": ".venv", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.11.7" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 0 160 | } 161 | --------------------------------------------------------------------------------