├── .github
│   └── workflows
│       ├── ci.yml
│       └── pre-commit.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── README.md
├── binder.jupyterlab-workspace
├── examples
│   ├── Reading-Parquet-Files-using-DuckDB.ipynb
│   ├── Substrait.ipynb
│   ├── campaign-finance.ipynb
│   ├── clickhouse-hackernews.ipynb
│   └── imdb.ipynb
├── requirements-test.txt
├── requirements.txt
├── scripts
│   └── prepare_campaign_finance_data.py
├── tutorial
│   ├── 01-Introduction-to-Ibis.ipynb
│   ├── 02-Aggregates-Joins.ipynb
│   ├── 03-Expressions-Lazy-Mode-Logging.ipynb
│   ├── 04-More-Value-Expressions.ipynb
│   ├── 05-IO-Create-Insert-External-Data.ipynb
│   ├── 06-ComplexFiltering.ipynb
│   ├── 07-Analytics-Tools.ipynb
│   └── tutorial_utils.py
└── welcome.md
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Build and Test
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 | paths-ignore:
9 | - "README.md"
10 | schedule:
11 | - cron: "0 8 * * 1-5" # Every weekday at 8 am UTC
12 |
13 | jobs:
14 | tests:
15 | name: Run Tests
16 | runs-on: ubuntu-latest
17 |
18 | steps:
19 | - uses: actions/checkout@v2
20 |
21 | - name: Install Python
22 | uses: actions/setup-python@v2
23 | with:
24 | python-version: "3.11"
25 |
26 | - name: Install dependencies
27 | run: |
28 | pip install -r requirements.txt -r requirements-test.txt
29 |
30 | - name: Run tests
31 | run: |
32 | pytest --nbmake -vv tutorial/ examples/
33 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | name: Linting
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 |
9 | jobs:
10 | lint:
11 | name: Linting
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - uses: actions/checkout@v2
16 |
17 | - name: Install Python
18 | uses: actions/setup-python@v2
19 | with:
20 | python-version: "3.10"
21 |
22 | - name: Run pre-commit
23 | uses: pre-commit/action@v3.0.0
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # OS generated files
132 | .directory
133 | .gdb_history
134 | .DS_Store?
135 | Icon?
136 |
137 | # Tutorial and examples artifacts
138 | geography.db
139 | palmer_penguins.ddb*
140 | data/
141 | *.log
142 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/kynan/nbstripout
3 | rev: 0.6.0
4 | hooks:
5 | - id: nbstripout
6 | - repo: https://github.com/psf/black
7 | rev: 23.3.0
8 | hooks:
9 | - id: black-jupyter
10 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10
2 |
3 | ARG NB_USER=jovyan
4 | ARG NB_UID=1000
5 | ENV USER ${NB_USER}
6 | ENV NB_UID ${NB_UID}
7 | ENV HOME /home/${NB_USER}
8 |
9 | RUN adduser --disabled-password \
10 | --gecos "Default user" \
11 | --uid ${NB_UID} \
12 | ${NB_USER}
13 |
14 | RUN apt-get update && \
15 | apt-get install -y git && \
16 | apt-get clean && \
17 | rm -rf /var/lib/apt/lists/*
18 |
19 | # Install dependencies
20 | COPY requirements.txt requirements.txt
21 | RUN pip install --no-cache -r requirements.txt \
22 | && find /usr/local/lib/python3.10/site-packages/ -follow -type f -name '*.js.map' -delete
23 |
24 | COPY --chown=${NB_UID} examples ${HOME}/examples
25 | COPY --chown=${NB_UID} tutorial ${HOME}/tutorial
26 | COPY --chown=${NB_UID} welcome.md ${HOME}/welcome.md
27 | COPY --chown=${NB_UID} binder.jupyterlab-workspace ${HOME}/.jupyter/lab/workspaces/default-37a8.jupyterlab-workspace
28 |
29 | USER ${USER}
30 | WORKDIR ${HOME}
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Ibis Example Notebooks
2 |
3 | [![Build and Test](https://github.com/ibis-project/ibis-examples/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/ibis-project/ibis-examples/actions/workflows/ci.yml?query=branch%3Amain)
4 |
5 | This repository contains runnable example notebooks for
6 | [Ibis](https://ibis-project.org/). They are intended to be educational and give users
7 | examples of common Ibis workflows.
8 |
9 | The easiest way to run the notebooks is in the cloud by clicking the big button
10 | below:
11 |
12 | <a href="https://mybinder.org/v2/gh/ibis-project/ibis-examples/main">
13 |   <img src="https://mybinder.org/badge_logo.svg"
14 |        alt="Launch on Binder"
15 |        width="300px"/>
16 | </a>
17 |
18 | ## Contributing
19 |
20 | Do you have an interesting example notebook that makes use of `ibis`? Please
21 | consider submitting it to this repository.
22 |
23 | Note that examples should adhere to the following guidelines:
24 |
25 | - Run top-to-bottom without error/intervention from the user (CI checks this; see the snippet below this list)
26 |
27 | - Not require external data sources that are likely to disappear over time
28 |
29 | - Not be resource intensive. Currently notebook sessions launched in
30 | [mybinder.org](https://mybinder.org) have access to 1-2 GiB of RAM. Your
31 | example must be able to run within these restrictions.
32 |
33 | - Be *interesting*, *exemplary* of good `ibis` usage, and of *general
34 | relevance* to `ibis` users.
35 |
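36 | To run that check locally before submitting, here is a sketch using the same commands CI runs in `ci.yml`:
37 |
38 | ```
39 | pip install -r requirements.txt -r requirements-test.txt
40 | pytest --nbmake -vv tutorial/ examples/
41 | ```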
--------------------------------------------------------------------------------
/binder.jupyterlab-workspace:
--------------------------------------------------------------------------------
1 | {"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":0,"widgets":["markdownviewer-widget:welcome.md"]},"current":"markdownviewer-widget:welcome.md"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.13545601726929304,0.864543982730707,0]},"file-browser-filebrowser:cwd":{"path":""},"workspace-ui:lastSave":"binder.jupyterlab-workspace","markdownviewer-widget:welcome.md":{"data":{"path":"welcome.md","factory":"Markdown Preview"}}},"metadata":{"id":"binder","last_modified":"2023-02-01T20:51:34.826742+00:00","created":"2023-02-01T20:51:34.826742+00:00"}}
--------------------------------------------------------------------------------
/examples/Reading-Parquet-Files-using-DuckDB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "379407de-ed10-472c-ad81-228ba73c7d15",
6 | "metadata": {},
7 | "source": [
8 | "# Reading Parquet Files using DuckDB\n",
9 | "\n",
10 | "In this example, we will use Ibis's DuckDB backend to analyze data from a remote parquet source using `ibis.read_parquet`.\n",
11 | "`ibis.read_parquet` can also read local parquet files,\n",
12 | "and there are other `ibis.read_*` functions that conveniently return a table expression from a file.\n",
13 | "One such function is `ibis.read_csv`, which reads from local and remote CSV.\n",
14 | "\n",
15 | "We will be reading from the [**Global Biodiversity Information Facility (GBIF) Species Occurrences**](https://registry.opendata.aws/gbif/) dataset.\n",
16 | "It is hosted on S3 at `s3://gbif-open-data-us-east-1/occurrence/`"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "4402d524-bd38-4127-a8ec-500be723711c",
22 | "metadata": {},
23 | "source": [
24 | "## Reading One Partition\n",
25 | "\n",
26 | "We can read a single partition by specifying its path.\n",
27 | "\n",
28 | "We do this by calling [`read_parquet`](https://ibis-project.org/api/expressions/top_level/#ibis.read_parquet) on the partition we care about.\n",
29 | "\n",
30 | "So to read the first partition in this dataset, we'll call `read_parquet` on `00000` in that path:"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "id": "062ba84c-1f4f-4ec7-9df5-73444c491342",
37 | "metadata": {
38 | "tags": []
39 | },
40 | "outputs": [],
41 | "source": [
42 | "import ibis\n",
43 | "\n",
44 | "t = ibis.read_parquet(\n",
45 | " f\"s3://gbif-open-data-us-east-1/occurrence/2023-04-01/occurrence.parquet/000000\"\n",
46 | ")\n",
47 | "t"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "id": "5440fa0f-2aca-40da-b4ed-4fde06051e10",
53 | "metadata": {},
54 | "source": [
55 | "Note that we're calling `read_parquet` and receiving a table expression without establishing a connection first.\n",
56 | "Ibis spins up a DuckDB connection (or whichever default backend you have) when you call `ibis.read_parquet` (or even `ibis.read_csv`).\n",
57 | "\n",
58 | "Since our result, `t`, is a table expression, we can now run queries against the file using Ibis expressions.\n",
59 | "For example, we can select columns, filter the file, and then view the first five rows of the result:"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "id": "035e845c-761a-4728-9361-ae33f3205c45",
66 | "metadata": {
67 | "tags": []
68 | },
69 | "outputs": [],
70 | "source": [
71 | "cols = [\n",
72 | " \"gbifid\",\n",
73 | " \"datasetkey\",\n",
74 | " \"occurrenceid\",\n",
75 | " \"kingdom\",\n",
76 | " \"phylum\",\n",
77 | " \"class\",\n",
78 | " \"order\",\n",
79 | " \"family\",\n",
80 | " \"genus\",\n",
81 | " \"species\",\n",
82 | " \"day\",\n",
83 | " \"month\",\n",
84 | " \"year\",\n",
85 | "]\n",
86 | "\n",
87 | "t.select(cols).filter(t[\"family\"].isin([\"Corvidae\"])).limit(5).to_pandas()"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "id": "4595a5ae-0007-4b8a-8e31-803d92e7e52c",
93 | "metadata": {},
94 | "source": [
95 | "or count the rows in the table (partition):"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "bd6d8cc6-ce49-44dd-9507-bd26176127f8",
102 | "metadata": {
103 | "tags": []
104 | },
105 | "outputs": [],
106 | "source": [
107 | "t.count().to_pandas()"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "id": "4286d9f0-8e06-498b-a561-e75193126adc",
113 | "metadata": {},
114 | "source": [
115 | "## Reading All Partitions: Filter, Aggregate, Export\n",
116 | "We can use `read_parquet` to read an entire parquet file by globbing all partitions:"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "id": "3d2246c9-57b0-4b6c-8849-e8d2d85b29bb",
123 | "metadata": {
124 | "tags": []
125 | },
126 | "outputs": [],
127 | "source": [
128 | "t = ibis.read_parquet(\n",
129 | " f\"s3://gbif-open-data-us-east-1/occurrence/2023-04-01/occurrence.parquet/*\"\n",
130 | ")"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "id": "9bd746c0-d414-4212-ab76-c5d585bafc82",
136 | "metadata": {},
137 | "source": [
138 | "and since the function returns a table expression, we can perform valid selections, filters, aggregations, and exports just as we could with any other table expression:"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "id": "0f92c38b-1487-464c-86a2-4b922831207e",
145 | "metadata": {
146 | "tags": []
147 | },
148 | "outputs": [],
149 | "source": [
150 | "df = (\n",
151 | " t.select([\"gbifid\", \"family\", \"species\"])\n",
152 | " .filter(t[\"family\"].isin([\"Corvidae\"]))\n",
153 | " # Here we limit by 10,000 to fetch a quick batch of results\n",
154 | " .limit(10000)\n",
155 | " .group_by(\"species\")\n",
156 | " .count()\n",
157 | " .to_pandas()\n",
158 | ")\n",
159 | "\n",
160 | "print(df.shape)\n",
161 | "df.head()"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "id": "aecbd689-d632-42e1-80ed-28a7f0a22d17",
168 | "metadata": {},
169 | "outputs": [],
170 | "source": []
171 | }
172 | ],
173 | "metadata": {
174 | "kernelspec": {
175 | "display_name": "Python 3 (ipykernel)",
176 | "language": "python",
177 | "name": "python3"
178 | },
179 | "language_info": {
180 | "codemirror_mode": {
181 | "name": "ipython",
182 | "version": 3
183 | },
184 | "file_extension": ".py",
185 | "mimetype": "text/x-python",
186 | "name": "python",
187 | "nbconvert_exporter": "python",
188 | "pygments_lexer": "ipython3",
189 | "version": "3.10.6"
190 | }
191 | },
192 | "nbformat": 4,
193 | "nbformat_minor": 5
194 | }
195 |
--------------------------------------------------------------------------------
/examples/Substrait.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Substrait\n",
8 | "\n",
9 | "[Substrait](https://substrait.io) is a cross-language specification for data compute operations. Ibis can produce Substrait plans using the `ibis-substrait` python package. \n",
10 | "\n",
11 | "### Why Substrait?\n",
12 | "\n",
13 | "The current state of the world requires tools like Ibis to build connectors for each unique data system. This is a many-to-many relationship that grows exponentially. Substrait removes the need for connectors by introducing an Intermediate Representation (IR). Now, we can have a many-to-one relationship from frontend -> IR and a one-to-many relationship from IR -> backend. \n",
14 | "\n",
15 | "### But, how is this useful to me?\n",
16 | "\n",
17 | "Interoperability now _and in the future_. The same Substrait Plan can run anywhere that has built-in support for the Substrait specification. No need to wait for Ibis to implement the shiny new connector for your data system of choice."
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Example\n",
25 | "\n",
26 | "Let's see Ibis Substrait in action."
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### Setup\n",
34 | "\n",
35 | "Let's build a toy example of a database server. Our example uses a local DuckDB database, but in practice we can imagine talking to a database server over the network."
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import duckdb\n",
45 | "import os\n",
46 | "from urllib.request import urlretrieve\n",
47 | "\n",
48 | "\n",
49 | "class DatabaseServer:\n",
50 | " DB_NAME = \"palmer_penguins.ddb\"\n",
51 | " DB_URL = \"https://storage.googleapis.com/ibis-tutorial-data/palmer_penguins.ddb\"\n",
52 | "\n",
53 | " def __init__(self):\n",
54 | " if not os.path.exists(self.DB_NAME):\n",
55 | " urlretrieve(self.DB_URL, self.DB_NAME)\n",
56 | " self.db = duckdb.connect(self.DB_NAME)\n",
57 | " self.db.install_extension(\"substrait\")\n",
58 | " self.db.load_extension(\"substrait\")\n",
59 | "\n",
60 | " def execute(self, substrait):\n",
61 | " result = self.db.from_substrait(substrait)\n",
62 | " return result.fetchall()\n",
63 | "\n",
64 | "\n",
65 | "db_server = DatabaseServer()"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "### Ibis Table\n",
73 | "\n",
74 | "We need an Ibis Table to query against. Let's define one that matches the table in our mock DB server."
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "import ibis\n",
84 | "from ibis.expr.datatypes.core import Float64, Int64, String\n",
85 | "\n",
86 | "table = ibis.table(\n",
87 | " name=\"penguins\",\n",
88 | " schema=[\n",
89 | " (\"species\", String()),\n",
90 | " (\"island\", String()),\n",
91 | " (\"bill_length_mm\", Float64()),\n",
92 | " (\"bill_depth_mm\", Float64()),\n",
93 | " (\"flipper_length_mm\", Int64()),\n",
94 | " (\"body_mass_g\", Int64()),\n",
95 | " (\"sex\", String()),\n",
96 | " (\"year\", Int64),\n",
97 | " ],\n",
98 | ")\n",
99 | "\n",
100 | "print(table)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "### Substrait Compiler\n",
108 | "\n",
109 | "The `ibis-substrait` package provides a `SubstraitCompiler` that can both compile and decompile Substrait Plans.\n",
110 | "\n",
111 | "Let's see it in action:"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "from ibis import _\n",
121 | "from ibis_substrait.compiler.core import SubstraitCompiler\n",
122 | "\n",
123 | "compiler = SubstraitCompiler()\n",
124 | "\n",
125 | "query = table.select(_.species).group_by(_.species).agg(count=_.species.count())\n",
126 | "\n",
127 | "substrait_plan = compiler.compile(query)\n",
128 | "\n",
129 | "print(substrait_plan)"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "### Substrait Execution\n",
137 | "\n",
138 | "Let's serialize the Substrait Plan to bytes that can be sent over the network and pass them to our mock DB server.\n",
139 | "\n",
140 | "The query counts the number of penguins per species."
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "plan_bytes = substrait_plan.SerializeToString()\n",
150 | "\n",
151 | "db_server.execute(substrait=plan_bytes)"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "Success! We've created an Ibis Table expression, serialized it to the Substrait IR, sent it to our DB server, and received the resulting rows back.\n",
159 | "\n",
160 | "We can iterate on our data analysis. Let's see how many of each species lives on each island."
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "query = (\n",
170 | " table.select(_.island, _.species)\n",
171 | " .group_by([_.island, _.species])\n",
172 | " .agg(num=_.species.count())\n",
173 | " .order_by([ibis.asc(_.island), ibis.asc(_.species)])\n",
174 | ")\n",
175 | "\n",
176 | "plan_bytes = compiler.compile(query).SerializeToString()\n",
177 | "\n",
178 | "db_server.execute(substrait=plan_bytes)"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "Interesting! And what is the average body mass in grams for each row result?"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "query = (\n",
195 | " table.select(_.island, _.species, _.body_mass_g)\n",
196 | " .group_by([_.island, _.species])\n",
197 | " .agg(num=_.species.count(), avg_weight=_.body_mass_g.mean())\n",
198 | " .order_by([ibis.asc(_.island), ibis.asc(_.species)])\n",
199 | ")\n",
200 | "\n",
201 | "plan_bytes = compiler.compile(query).SerializeToString()\n",
202 | "\n",
203 | "db_server.execute(substrait=plan_bytes)"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "## Conclusion\n",
211 | "\n",
212 | "We saw how we can translate Ibis expressions into Substrait Plans that can theoretically run anywhere. Backend support for Substrait is growing. Checkout some compatible projects such as [DuckDB](https://duckdb.org/docs/extensions/substrait), [Apache DataFusion](https://arrow.apache.org/datafusion), and Apache Arrow's [Acero](https://arrow.apache.org/docs/cpp/streaming_execution.html)!"
213 | ]
214 | }
215 | ],
216 | "metadata": {
217 | "kernelspec": {
218 | "display_name": "Python 3 (ipykernel)",
219 | "language": "python",
220 | "name": "python3"
221 | },
222 | "language_info": {
223 | "codemirror_mode": {
224 | "name": "ipython",
225 | "version": 3
226 | },
227 | "file_extension": ".py",
228 | "mimetype": "text/x-python",
229 | "name": "python",
230 | "nbconvert_exporter": "python",
231 | "pygments_lexer": "ipython3",
232 | "version": "3.10.10"
233 | }
234 | },
235 | "nbformat": 4,
236 | "nbformat_minor": 2
237 | }
238 |
--------------------------------------------------------------------------------
/examples/campaign-finance.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Exploring Campaign Finance Data\n",
8 | "\n",
9 | "Hi! My name is [Nick Crews](https://www.linkedin.com/in/nicholas-b-crews/),\n",
10 | "and I'm a data engineer that looks at public campaign finance data.\n",
11 | "\n",
12 | "In this post, I'll walk through how I use Ibis to explore public campaign contribution\n",
13 | "data from the Federal Election Commission (FEC). We'll do some loading,\n",
14 | "cleaning, featurizing, and visualization. There will be filtering, sorting, grouping,\n",
15 | "and aggregation."
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Downloading the Data\n",
23 | "\n",
24 | "The FEC publishes raw data as csvs to an S3 bucket [here](https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/2018/indiv18.zip). This specific file expands to a 4.3 GiB csv file.\n",
25 | "\n",
26 | "While `ibis` could load that csv file directly, in the interest of making this example notebook quick and easy to run I've downloaded and converted the relevant csvs to parquet files, and uploaded them to google cloud (all this preprocessing was also done with `ibis`). For the interested, the preprocessing script can be found [here](../scripts/prepare_campaign_finance_data.py).\n",
27 | "\n",
28 | "We can download the parquet files to work with directly using `urllib.request.urlretrieve`:\n"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "import pathlib\n",
38 | "from urllib.request import urlretrieve\n",
39 | "\n",
40 | "data_dir = pathlib.Path.cwd().parent / \"data\"\n",
41 | "data_dir.mkdir(exist_ok=True)\n",
42 | "contribs_path = data_dir / \"contributions-2018.parquet\"\n",
43 | "comms_path = data_dir / \"committees-2018.parquet\"\n",
44 | "\n",
45 | "if not contribs_path.exists():\n",
46 | " urlretrieve(\n",
47 | " \"https://storage.googleapis.com/ibis-example-notebooks-data/contributions-2018.parquet\",\n",
48 | " contribs_path,\n",
49 | " )\n",
50 | "\n",
51 | "if not comms_path.exists():\n",
52 | " urlretrieve(\n",
53 | " \"https://storage.googleapis.com/ibis-example-notebooks-data/committees-2018.parquet\",\n",
54 | " comms_path,\n",
55 | " )"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Loading the data\n",
63 | "\n",
64 | "Now that we have our data, let's load it into Ibis.\n",
65 | "\n",
66 | "Since our data is stored as `parquet` files, we can do that using `ibis.read_parquet`. This takes a path to a parquet file, and returns a `Table` representing the loaded data.\n",
67 | "\n",
68 | "We'll also turn on `interactive` mode, so we can peak at the query results as we work."
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "import ibis\n",
78 | "from ibis import _\n",
79 | "\n",
80 | "ibis.options.interactive = True\n",
81 | "\n",
82 | "contribs = ibis.read_parquet(contribs_path)\n",
83 | "contribs"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "The above table shows just the first few rows. To see how many rows of data we actually have, we can use the `.count()` method:"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "contribs.count()"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "### Committees Data\n",
107 | "\n",
108 | "The contributions only list an opaque `CMTE_ID` column. We want to know which actual\n",
109 | "committee this is. Let's load the committees table so we can lookup from\n",
110 | "committee ID to committee name."
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "comms = ibis.read_parquet(comms_path)\n",
120 | "comms"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "Now add a the committee name to the contributions table:"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "together = contribs.join(comms, \"CMTE_ID\")\n",
137 | "together"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "## Cleaning"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "Let's look at the `ENTITY_TP` column. This represents the type of entity that\n",
152 | "made the contribution:"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "together.ENTITY_TP.value_counts()"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "We only care about contributions from individuals.\n",
169 | "\n",
170 | "Once we filter on this column, the contents of it are irrelevant, so let's drop it."
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "cleaned = together[_.ENTITY_TP == \"IND\"].drop(\"ENTITY_TP\")"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "It looks like the `TRANSACTION_DT` column was a raw string like \"MMDDYYYY\", \n",
187 | "so let's convert that to a proper date type."
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "from ibis.expr.types import StringValue, DateValue\n",
197 | "\n",
198 | "\n",
199 | "def mmddyyyy_to_date(val: StringValue) -> DateValue:\n",
200 | " return val.cast(str).lpad(8, \"0\").to_timestamp(\"%m%d%Y\").date()\n",
201 | "\n",
202 | "\n",
203 | "cleaned = cleaned.mutate(date=mmddyyyy_to_date(_.TRANSACTION_DT)).drop(\"TRANSACTION_DT\")\n",
204 | "cleaned"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "The `TRANSACTION_PGI` column represents the type (primary, general, etc) of election,\n",
212 | "and the year. But it seems to be not very consistent:"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "cleaned.TRANSACTION_PGI.topk(10)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "def get_election_type(pgi: StringValue) -> StringValue:\n",
231 | " \"\"\"Use the first letter of the TRANSACTION_PGI column to determine the election type\n",
232 | "\n",
233 | " If the first letter is not one of the known election stage, then return null.\n",
234 | " \"\"\"\n",
235 | " election_types = {\n",
236 | " \"P\": \"primary\",\n",
237 | " \"G\": \"general\",\n",
238 | " \"O\": \"other\",\n",
239 | " \"C\": \"convention\",\n",
240 | " \"R\": \"runoff\",\n",
241 | " \"S\": \"special\",\n",
242 | " \"E\": \"recount\",\n",
243 | " }\n",
244 | " first_letter = pgi[0]\n",
245 | " return first_letter.substitute(election_types, else_=ibis.NA)\n",
246 | "\n",
247 | "\n",
248 | "cleaned = cleaned.mutate(election_type=get_election_type(_.TRANSACTION_PGI)).drop(\n",
249 | " \"TRANSACTION_PGI\"\n",
250 | ")\n",
251 | "cleaned"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "That worked well! There are 0 nulls in the resulting column, so we always were\n",
259 | "able to determine the elction type."
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {},
266 | "outputs": [],
267 | "source": [
268 | "cleaned.election_type.topk(10)"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "metadata": {},
274 | "source": [
275 | "About 1/20 of transactions are negative. These could represent refunds, or they could be data\n",
276 | "entry errors. Let's simply drop them to keep it simple."
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "above_zero = cleaned.TRANSACTION_AMT > 0\n",
286 | "cleaned = cleaned[above_zero]\n",
287 | "above_zero.value_counts()"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {},
293 | "source": [
294 | "## Adding Features\n",
295 | "\n",
296 | "Now that the data is cleaned up to a usable format, let's add some features.\n",
297 | "\n",
298 | "First, it's useful to categorize donations by size, placing them into buckets\n",
299 | "of small, medium, large, etc."
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "edges = [\n",
309 | " 10,\n",
310 | " 50,\n",
311 | " 100,\n",
312 | " 500,\n",
313 | " 1000,\n",
314 | " 5000,\n",
315 | "]\n",
316 | "labels = [\n",
317 | " \"<10\",\n",
318 | " \"10-50\",\n",
319 | " \"50-100\",\n",
320 | " \"100-500\",\n",
321 | " \"500-1000\",\n",
322 | " \"1000-5000\",\n",
323 | " \"5000+\",\n",
324 | "]\n",
325 | "\n",
326 | "\n",
327 | "def bucketize(vals, edges, str_labels):\n",
328 | " # Uses Ibis's .bucket() method to create a categorical column\n",
329 | " int_labels = vals.bucket(edges, include_under=True, include_over=True)\n",
330 | " # Map the integer labels to the string labels\n",
331 | " int_to_str = {str(i): s for i, s in enumerate(str_labels)}\n",
332 | " return int_labels.cast(str).substitute(int_to_str)\n",
333 | "\n",
334 | "\n",
335 | "featured = cleaned.mutate(amount_bucket=bucketize(_.TRANSACTION_AMT, edges, labels))\n",
336 | "featured"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "## Analysis\n",
344 | "\n",
345 | "### By donation size\n",
346 | "\n",
347 | "One thing we can look at is the donation breakdown by size:\n",
348 | "- Are most donations small or large?\n",
349 | "- Where do politicians/committees get most of their money from? Large or small donations?\n",
350 | "\n",
351 | "We also will compare performance of Ibis vs pandas during this groupby."
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": null,
357 | "metadata": {},
358 | "outputs": [],
359 | "source": [
360 | "def summary_by(table, by):\n",
361 | " return table.group_by(by).agg(\n",
362 | " n_donations=_.count(),\n",
363 | " total_amount=_.TRANSACTION_AMT.sum(),\n",
364 | " mean_amount=_.TRANSACTION_AMT.mean(),\n",
365 | " median_amount=_.TRANSACTION_AMT.approx_median(),\n",
366 | " )\n",
367 | "\n",
368 | "\n",
369 | "def summary_by_pandas(df, by):\n",
370 | " return df.groupby(by, as_index=False).agg(\n",
371 | " n_donations=(\"election_type\", \"count\"),\n",
372 | " total_amount=(\"TRANSACTION_AMT\", \"sum\"),\n",
373 | " mean_amount=(\"TRANSACTION_AMT\", \"mean\"),\n",
374 | " median_amount=(\"TRANSACTION_AMT\", \"median\"),\n",
375 | " )\n",
376 | "\n",
377 | "\n",
378 | "# persist the input data so the following timings of the group_by are accurate.\n",
379 | "subset = featured[\"election_type\", \"amount_bucket\", \"TRANSACTION_AMT\"]\n",
380 | "subset = subset.cache()\n",
381 | "pandas_subset = subset.to_pandas()"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {},
387 | "source": [
388 | "Let's take a look at what we are actually computing:"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "by_type_and_bucket = summary_by(subset, [\"election_type\", \"amount_bucket\"])\n",
398 | "by_type_and_bucket"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {},
404 | "source": [
405 | "OK, now let's do our timings.\n",
406 | "\n",
407 | "One interesting thing to pay attention to here is the execution time for the following\n",
408 | "groupby. Before, we could get away with lazy execution: because we only wanted to preview\n",
409 | "the first few rows, we only had to compute the first few rows, so all our previews were\n",
410 | "very fast.\n",
411 | "\n",
412 | "But now, as soon as we do a groupby, we have to actually go through the whole dataset\n",
413 | "in order to compute the aggregate per group. So this is going to be slower. BUT,\n",
414 | "duckdb is still quite fast. It only takes .4 seconds to groupby-agg all 20 million rows!"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "%time summary_by(subset, [\"election_type\", \"amount_bucket\"]).to_pandas(); #to_pandas() so we actually fetch the data"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "Now let's try the same thing in pandas:"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": null,
436 | "metadata": {},
437 | "outputs": [],
438 | "source": [
439 | "%time summary_by_pandas(pandas_subset, [\"election_type\", \"amount_bucket\"]);"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "On my machine it takes about 3 seconds, which is about 6 times slower than duckdb.\n",
447 | "\n",
448 | "At this scale, it again doesn't matter, but you could imagine with a dataset much larger than this, it would matter.\n",
449 | "\n",
450 | "Let's also think about memory usage:"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "metadata": {},
457 | "outputs": [],
458 | "source": [
459 | "pandas_subset.memory_usage(deep=True).sum() / 1e9 # GB"
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "metadata": {},
465 | "source": [
466 | "The source dataframe is couple gigabytes, so probably during the groupby,\n",
467 | "the peak memory usage is going to be a bit higher than this. You could use a profiler\n",
468 | "such as [FIL](https://github.com/pythonspeed/filprofiler) if you wanted an exact number,\n",
469 | "I was too lazy to use that here.\n",
470 | "\n",
471 | "Again, this works on my laptop at this dataset size, but much larger than this and I'd\n",
472 | "start having problems. Duckdb on the other hand is designed around working out of core\n",
473 | "so it should scale to datasets into the hundreds of gigabytes, much larger than your\n",
474 | "computer's RAM.\n",
475 | "\n",
476 | "### Back to analysis\n",
477 | "\n",
478 | "OK, let's plot the result of that groupby.\n",
479 | "\n",
480 | "Surprise! (Or maybe not...) Most donations are small. But most of the money comes\n",
481 | "from donations larger than $1000.\n",
482 | "\n",
483 | "Well if that's the case, why do politicians spend so much time soliciting small\n",
484 | "donations? One explanation is that they can use the number of donations\n",
485 | "as a marketing pitch, to show how popular they are, and thus how viable of a\n",
486 | "candidate they are.\n",
487 | "\n",
488 | "This also might explain whose interests are being served by our politicians."
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": [
497 | "import altair as alt\n",
498 | "\n",
499 | "# Do some bookkeeping so the buckets are displayed smallest to largest on the charts\n",
500 | "bucket_col = alt.Column(\"amount_bucket:N\", sort=labels)\n",
501 | "\n",
502 | "n_by_bucket = (\n",
503 | " alt.Chart(by_type_and_bucket.to_pandas())\n",
504 | " .mark_bar()\n",
505 | " .encode(\n",
506 | " x=bucket_col,\n",
507 | " y=\"n_donations:Q\",\n",
508 | " color=\"election_type:N\",\n",
509 | " )\n",
510 | ")\n",
511 | "total_by_bucket = (\n",
512 | " alt.Chart(by_type_and_bucket.to_pandas())\n",
513 | " .mark_bar()\n",
514 | " .encode(\n",
515 | " x=bucket_col,\n",
516 | " y=\"total_amount:Q\",\n",
517 | " color=\"election_type:N\",\n",
518 | " )\n",
519 | ")\n",
520 | "n_by_bucket | total_by_bucket"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {},
526 | "source": [
527 | "### By election stage\n",
528 | "\n",
529 | "Let's look at how donations break down by election stage. Do people donate\n",
530 | "differently for primary elections vs general elections?\n",
531 | "\n",
532 | "Let's ignore everything but primary and general elections, since they are the\n",
533 | "most common, and arguably the most important."
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": null,
539 | "metadata": {},
540 | "outputs": [],
541 | "source": [
542 | "gb2 = by_type_and_bucket[_.election_type.isin((\"primary\", \"general\"))]\n",
543 | "n_donations_per_election_type = _.n_donations.sum().over(group_by=\"election_type\")\n",
544 | "frac = _.n_donations / n_donations_per_election_type\n",
545 | "gb2 = gb2.mutate(frac_n_donations_per_election_type=frac)\n",
546 | "gb2"
547 | ]
548 | },
549 | {
550 | "cell_type": "markdown",
551 | "metadata": {},
552 | "source": [
553 | "It looks like primary elections get a larger proportion of small donations."
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": null,
559 | "metadata": {},
560 | "outputs": [],
561 | "source": [
562 | "alt.Chart(gb2.to_pandas()).mark_bar().encode(\n",
563 | " x=\"election_type:O\",\n",
564 | " y=\"frac_n_donations_per_election_type:Q\",\n",
565 | " color=bucket_col,\n",
566 | ")"
567 | ]
568 | },
569 | {
570 | "cell_type": "markdown",
571 | "metadata": {},
572 | "source": [
573 | "### By recipient\n",
574 | "\n",
575 | "Let's look at the top players. Who gets the most donations?\n",
576 | "\n",
577 | "Far and away it is ActBlue, which acts as a conduit for donations to Democratic\n",
578 | "interests.\n",
579 | "\n",
580 | "Beto O'Rourke is the top individual politician, hats off to him!"
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": null,
586 | "metadata": {},
587 | "outputs": [],
588 | "source": [
589 | "by_recip = summary_by(featured, \"CMTE_NM\")\n",
590 | "by_recip"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": null,
596 | "metadata": {},
597 | "outputs": [],
598 | "source": [
599 | "top_recip = by_recip.order_by(ibis.desc(\"n_donations\")).head(10)\n",
600 | "alt.Chart(top_recip.to_pandas()).mark_bar().encode(\n",
601 | " x=alt.X(\"CMTE_NM:O\", sort=\"-y\"),\n",
602 | " y=\"n_donations:Q\",\n",
603 | ")"
604 | ]
605 | },
606 | {
607 | "cell_type": "markdown",
608 | "metadata": {},
609 | "source": [
610 | "### By Location\n",
611 | "\n",
612 | "Where are the largest donations coming from?"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": null,
618 | "metadata": {},
619 | "outputs": [],
620 | "source": [
621 | "f2 = featured.mutate(loc=_.CITY + \", \" + _.STATE).drop(\"CITY\", \"STATE\")\n",
622 | "by_loc = summary_by(f2, \"loc\")\n",
623 | "# Drop the places with a small number of donations so we're\n",
624 | "# resistant to outliers for the mean\n",
625 | "by_loc = by_loc[_.n_donations > 1000]\n",
626 | "by_loc"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": null,
632 | "metadata": {},
633 | "outputs": [],
634 | "source": [
635 | "def top_by(col):\n",
636 | " top = by_loc.order_by(ibis.desc(col)).head(10)\n",
637 | " return (\n",
638 | " alt.Chart(top.to_pandas())\n",
639 | " .mark_bar()\n",
640 | " .encode(\n",
641 | " x=alt.X(\"loc:O\", sort=\"-y\"),\n",
642 | " y=col,\n",
643 | " )\n",
644 | " )\n",
645 | "\n",
646 | "\n",
647 | "top_by(\"n_donations\") | top_by(\"total_amount\") | top_by(\"mean_amount\") | top_by(\n",
648 | " \"median_amount\"\n",
649 | ")"
650 | ]
651 | },
652 | {
653 | "cell_type": "markdown",
654 | "metadata": {},
655 | "source": [
656 | "### By month\n",
657 | "\n",
658 | "When do the donations come in?"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": null,
664 | "metadata": {},
665 | "outputs": [],
666 | "source": [
667 | "by_month = summary_by(featured, _.date.month().name(\"month_int\"))\n",
668 | "# Sorta hacky, .substritute doesn't work to change dtypes (yet?)\n",
669 | "# so we cast to string and then do our mapping\n",
670 | "month_map = {\n",
671 | " \"1\": \"Jan\",\n",
672 | " \"2\": \"Feb\",\n",
673 | " \"3\": \"Mar\",\n",
674 | " \"4\": \"Apr\",\n",
675 | " \"5\": \"May\",\n",
676 | " \"6\": \"Jun\",\n",
677 | " \"7\": \"Jul\",\n",
678 | " \"8\": \"Aug\",\n",
679 | " \"9\": \"Sep\",\n",
680 | " \"10\": \"Oct\",\n",
681 | " \"11\": \"Nov\",\n",
682 | " \"12\": \"Dec\",\n",
683 | "}\n",
684 | "by_month = by_month.mutate(month_str=_.month_int.cast(str).substitute(month_map))\n",
685 | "by_month"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": null,
691 | "metadata": {},
692 | "outputs": [],
693 | "source": [
694 | "months_in_order = list(month_map.values())\n",
695 | "alt.Chart(by_month.to_pandas()).mark_bar().encode(\n",
696 | " x=alt.X(\"month_str:O\", sort=months_in_order),\n",
697 | " y=\"n_donations:Q\",\n",
698 | ")"
699 | ]
700 | },
701 | {
702 | "cell_type": "markdown",
703 | "metadata": {},
704 | "source": [
705 | "## Conclusion\n",
706 | "\n",
707 | "Thanks for following along! I hope you've learned something about Ibis, and\n",
708 | "maybe even about campaign finance.\n",
709 | "\n",
710 | "Ibis is a great tool for exploring data. I now find myself reaching for it\n",
711 | "when in the past I would have reached for pandas.\n",
712 | "\n",
713 | "Some of the highlights for me:\n",
714 | "\n",
715 | "- Fast, lazy execution, a great display format, and good type hinting/editor support for a great REPL experience.\n",
716 | "- Very well thought-out API and semantics (e.g. `isinstance(val, NumericValue)`?? That's beautiful!)\n",
717 | "- Fast and fairly complete string support, since I work with a lot of text data.\n",
718 | "- Extremely responsive maintainers. Sometimes I've submitted multiple feature requests and bug reports in a single day, and a PR has been merged by the next day.\n",
719 | "- Escape hatch to SQL. I didn't have to use that here, but if something isn't supported, you can always fall back to SQL.\n",
720 | "\n",
721 | "Check out [The Ibis Website](https://ibis-project.org/) for more information."
722 | ]
723 | }
724 | ],
725 | "metadata": {
726 | "kernelspec": {
727 | "display_name": "Python 3 (ipykernel)",
728 | "language": "python",
729 | "name": "python3"
730 | },
731 | "language_info": {
732 | "codemirror_mode": {
733 | "name": "ipython",
734 | "version": 3
735 | },
736 | "file_extension": ".py",
737 | "mimetype": "text/x-python",
738 | "name": "python",
739 | "nbconvert_exporter": "python",
740 | "pygments_lexer": "ipython3",
741 | "version": "3.10.10"
742 | }
743 | },
744 | "nbformat": 4,
745 | "nbformat_minor": 2
746 | }
747 |
--------------------------------------------------------------------------------
/examples/clickhouse-hackernews.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Using Ibis with ClickHouse"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "[Ibis](https://ibis-project.com) supports reading and querying data using [ClickHouse](https://clickhouse.com/) as a backend.\n",
15 | "\n",
16 | "In this example we'll demonstrate connecting Ibis to a ClickHouse server, and using it to execute a few queries."
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import ibis\n",
26 | "from ibis import _\n",
27 | "\n",
28 | "ibis.options.interactive = True"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Creating a Connection\n",
36 | "\n",
37 | "First we need to connect Ibis to a running ClickHouse server.\n",
38 | "\n",
39 | "In this example we'll run queries against the publically available [ClickHouse playground](https://clickhouse.com/docs/en/getting-started/playground) server. To run against your own ClickHouse server you'd only need to change the connection details."
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "con = ibis.connect(\"clickhouse://play@play.clickhouse.com:443\")"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "## Listing available tables\n",
56 | "\n",
57 | "The ClickHouse playground server has a number of interesting datasets available. To see them, we can examine the tables via the `.tables` attribue. This shows a list of all tables available:"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "con.tables"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Inspecting a Table\n",
74 | "\n",
75 | "Lets take a look at the `hackernews` table. This table contains all posts and comments on [Hacker News](https://news.ycombinator.com/).\n",
76 | "\n",
77 | "We can access the table by attribute as `con.tables.hackernews`."
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "t = con.tables.hackernews"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "We can then take a peak at the first few rows using the `.head()` method."
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "t.head()"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "## Finding the highest scoring posts\n",
110 | "\n",
111 | "Here we find the top 5 posts by score.\n",
112 | "\n",
113 | "Posts have a title, so we:\n",
114 | "\n",
115 | "- `filter` out rows that lack a title\n",
116 | "- `select` only the columns we're interested in\n",
117 | "- `order` them by score, descending\n",
118 | "- `limit` to the top 5 rows"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "top_posts_by_score = (\n",
128 | " t.filter(_.title != \"\")\n",
129 | " .select(\"title\", \"score\")\n",
130 | " .order_by(ibis.desc(\"score\"))\n",
131 | " .limit(5)\n",
132 | ")\n",
133 | "\n",
134 | "top_posts_by_score"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "## Finding the most prolific commenters\n",
142 | "\n",
143 | "Here we find the top 5 commenters by number of comments made.\n",
144 | "\n",
145 | "To do this we:\n",
146 | "\n",
147 | "- `filter` out rows with no author\n",
148 | "- `group_by` author\n",
149 | "- `count` all the rows in each group\n",
150 | "- `order_by` the counts, descending\n",
151 | "- `limit` to the top 5 rows"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "top_commenters = (\n",
161 | " t.filter(_.by != \"\")\n",
162 | " .group_by(\"by\")\n",
163 | " .agg(count=_.count())\n",
164 | " .order_by(ibis.desc(\"count\"))\n",
165 | " .limit(5)\n",
166 | ")\n",
167 | "\n",
168 | "top_commenters"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "This query could also be expressed using the `.topk` method, which is a shorthand for the above:"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "# This is a shorthand for the above\n",
185 | "top_commenters = t.filter(_.by != \"\").by.topk(5)\n",
186 | "\n",
187 | "top_commenters"
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {},
193 | "source": [
194 | "## Finding top commenters by score"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "Here we find the top 5 commenters with the highest cumulative scores. In this case the `.topk` shorthand won't work and we'll need to write out the full `group_by` -> `agg` -> `order_by` -> `limit` pipeline."
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "top_commenters_by_score = (\n",
211 | " t.filter(_.by != \"\")\n",
212 | " .group_by(\"by\")\n",
213 | " .agg(total_score=_.score.sum())\n",
214 | " .order_by(ibis.desc(\"total_score\"))\n",
215 | " .limit(5)\n",
216 | ")\n",
217 | "\n",
218 | "top_commenters_by_score"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "## Next Steps\n",
226 | "\n",
227 | "There are lots of other interesting queries one might ask of this dataset. A few examples:\n",
228 | "\n",
229 | "- What posts had the most comments?\n",
230 | "- How do post scores fluctuate over time?\n",
231 | "- What day of the week has the highest average post score? What day has the lowest?\n",
232 | "\n",
233 | "To learn more about how to use Ibis with Clickhouse, see [the documentation](https://ibis-project.org/backends/ClickHouse/)."
234 | ]
235 | }
236 | ],
237 | "metadata": {
238 | "interpreter": {
239 | "hash": "db67a4c5f346815e3207df1348e9e718605305208b0cc89f618da4cb81ede2ba"
240 | },
241 | "kernelspec": {
242 | "display_name": "Python 3 (ipykernel)",
243 | "language": "python",
244 | "name": "python3"
245 | },
246 | "language_info": {
247 | "codemirror_mode": {
248 | "name": "ipython",
249 | "version": 3
250 | },
251 | "file_extension": ".py",
252 | "mimetype": "text/x-python",
253 | "name": "python",
254 | "nbconvert_exporter": "python",
255 | "pygments_lexer": "ipython3",
256 | "version": "3.10.12"
257 | }
258 | },
259 | "nbformat": 4,
260 | "nbformat_minor": 2
261 | }
262 |
--------------------------------------------------------------------------------
/examples/imdb.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Analyzing IMDB data with Ibis and DuckDB\n"
9 | ]
10 | },
11 | {
12 | "attachments": {},
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "Using the Ibis examples module and the DuckDB backend to find some movies to watch.\n",
17 | "\n",
18 | "Adapted from [Phillip in the Cloud's livestream using the same data](https://www.youtube.com/watch?v=J7sEn9VklKY)."
19 | ]
20 | },
21 | {
22 | "attachments": {},
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Imports\n",
27 | "\n",
28 | "For this example, we'll just use Ibis."
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "import ibis\n",
38 | "from ibis import _\n",
39 | "\n",
40 | "import ibis.examples as ex\n",
41 | "import ibis.expr.types as dt"
42 | ]
43 | },
44 | {
45 | "attachments": {},
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "## Configure Ibis\n",
50 | "\n",
51 | "We'll use the default backend and enable interactive output."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "ibis.options.interactive = True"
61 | ]
62 | },
63 | {
64 | "attachments": {},
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "## Fetch the example data\n",
69 | "\n",
70 | "We can use the `ibis.examples` module to fetch the IMDB data. Ibis will automatically cache the data so subsequent runs will be faster."
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "name_basics = ex.imdb_name_basics.fetch()"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "name_basics"
89 | ]
90 | },
91 | {
92 | "attachments": {},
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "To ensure column names are Pythonic, we can relabel as `snake_case`."
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "name_basics.relabel(\"snake_case\")"
106 | ]
107 | },
108 | {
109 | "attachments": {},
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "Let's grab all of the relevant IMDB tables and relabel columns."
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "name_basics = ex.imdb_name_basics.fetch().relabel(\"snake_case\")\n",
123 | "title_akas = ex.imdb_title_akas.fetch().relabel(\"snake_case\")\n",
124 | "title_basics = ex.imdb_title_basics.fetch().relabel(\"snake_case\")\n",
125 | "title_crew = ex.imdb_title_crew.fetch().relabel(\"snake_case\")\n",
126 | "title_episode = ex.imdb_title_episode.fetch().relabel(\"snake_case\")\n",
127 | "title_principals = ex.imdb_title_principals.fetch().relabel(\"snake_case\")\n",
128 | "title_ratings = ex.imdb_title_ratings.fetch().relabel(\"snake_case\")"
129 | ]
130 | },
131 | {
132 | "attachments": {},
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "## Preview the data\n",
137 | "\n",
138 | "We'll print out the first few rows of each table to get an idea of what is contained in each."
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "PREVIEW_SIZE = 3"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "name_basics.head(PREVIEW_SIZE)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "title_akas.head(PREVIEW_SIZE)"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "title_basics.head(PREVIEW_SIZE)"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "title_crew.head(PREVIEW_SIZE)"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "title_episode.head(PREVIEW_SIZE)"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "title_principals.head(PREVIEW_SIZE)"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "title_ratings.head(PREVIEW_SIZE)"
211 | ]
212 | },
213 | {
214 | "attachments": {},
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "## Basic data exploration\n",
219 | "\n",
220 | "Let's check how many records are in each table. It's just Python, so we can construct a dictionary and iterate through it in a for loop.\n"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "tables = {\n",
230 | " \"name_basics\": name_basics,\n",
231 | " \"title_akas\": title_akas,\n",
232 | " \"title_basics\": title_basics,\n",
233 | " \"title_crew\": title_crew,\n",
234 | " \"title_episode\": title_episode,\n",
235 | " \"title_principals\": title_principals,\n",
236 | " \"title_ratings\": title_ratings,\n",
237 | "}\n",
238 | "max_name_len = max(map(len, tables.keys())) + 1"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "print(\"Length of tables:\")\n",
248 | "for t in tables:\n",
249 | " print(f\"\\t{t.ljust(max_name_len)}: {tables[t].count().to_pandas():,}\")"
250 | ]
251 | },
252 | {
253 | "attachments": {},
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "## Clean data\n",
258 | "\n",
259 | "Looking at the data, the `nconst` and `tconst` columns seem to be unique identifiers. Let's confirm and adjust them accordingly."
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {},
266 | "outputs": [],
267 | "source": [
268 | "name_basics.head(PREVIEW_SIZE)"
269 | ]
270 | },
271 | {
272 | "attachments": {},
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "Check the number of unique `nconst` values."
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "name_basics.nconst.nunique()"
286 | ]
287 | },
288 | {
289 | "attachments": {},
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "Confirm it's equal to the number of rows."
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "name_basics.nconst.nunique() == name_basics.count()"
303 | ]
304 | },
305 | {
306 | "attachments": {},
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "Mutate the table to convert `nconst` to an integer."
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "t = name_basics.mutate(nconst=_.nconst.replace(\"nm\", \"\").cast(\"int\"))\n",
320 | "t.head(PREVIEW_SIZE)"
321 | ]
322 | },
323 | {
324 | "attachments": {},
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "Let's also turn `primary_profession` into an array of strings instead of a single comma-separated string."
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "t = t.mutate(primary_profession=_.primary_profession.split(\",\"))\n",
338 | "t"
339 | ]
340 | },
341 | {
342 | "attachments": {},
343 | "cell_type": "markdown",
344 | "metadata": {},
345 | "source": [
346 | "And, combining the two concepts, convert `known_for_titles` into an array of integers corresponding to `tconst` identifiers."
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": null,
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "t = t.mutate(\n",
356 | " known_for_titles=_.known_for_titles.split(\",\").map(\n",
357 | " lambda tconst: tconst.replace(\"tt\", \"\").cast(\"int\")\n",
358 | " )\n",
359 | ")\n",
360 | "t"
361 | ]
362 | },
363 | {
364 | "attachments": {},
365 | "cell_type": "markdown",
366 | "metadata": {},
367 | "source": [
368 | "## DRY-ing up the code\n",
369 | "\n",
370 | "We can define functions to convert `nconst` and `tconst` to integers."
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {},
377 | "outputs": [],
378 | "source": [
379 | "def nconst_to_int(nconst: dt.StringColumn) -> dt.IntegerColumn:\n",
380 | " return nconst.replace(\"nm\", \"\").cast(\"int\")\n",
381 | "\n",
382 | "\n",
383 | "def tconst_to_int(tconst: dt.StringColumn) -> dt.IntegerColumn:\n",
384 | " return tconst.replace(\"tt\", \"\").cast(\"int\")"
385 | ]
386 | },
387 | {
388 | "attachments": {},
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "Then combine the previous data cleansing in a single mutate call."
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "metadata": {},
399 | "outputs": [],
400 | "source": [
401 | "name_basics = name_basics.mutate(\n",
402 | " nconst=nconst_to_int(_.nconst),\n",
403 | " primary_profession=_.primary_profession.split(\",\"),\n",
404 | " known_for_titles=_.known_for_titles.split(\",\").map(tconst_to_int),\n",
405 | ")\n",
406 | "name_basics"
407 | ]
408 | },
409 | {
410 | "attachments": {},
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": [
414 | "We can use `ibis.show_sql` to see all the SQL this generates."
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "ibis.show_sql(name_basics)"
424 | ]
425 | },
426 | {
427 | "attachments": {},
428 | "cell_type": "markdown",
429 | "metadata": {},
430 | "source": [
431 | "Clean the rest of the tables. We'll convert `nconst` and `tconst` columns consistently to allow for easy joining."
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "metadata": {},
438 | "outputs": [],
439 | "source": [
440 | "title_akas = title_akas.mutate(title_id=tconst_to_int(_.title_id)).relabel(\n",
441 | " {\"title_id\": \"tconst\"}\n",
442 | ")\n",
443 | "title_basics = title_basics.mutate(tconst=tconst_to_int(_.tconst))\n",
444 | "title_crew = title_crew.mutate(\n",
445 | " tconst=tconst_to_int(_.tconst),\n",
446 | " directors=_.directors.split(\",\").map(nconst_to_int),\n",
447 | " writers=_.writers.split(\",\").map(nconst_to_int),\n",
448 | ")\n",
449 | "title_episode = title_episode.mutate(\n",
450 | " tconst=tconst_to_int(_.tconst), parent_tconst=tconst_to_int(_.parent_tconst)\n",
451 | ")\n",
452 | "title_principals = title_principals.mutate(\n",
453 | " tconst=tconst_to_int(_.tconst), nconst=nconst_to_int(_.nconst)\n",
454 | ")\n",
455 | "title_ratings = title_ratings.mutate(tconst=tconst_to_int(_.tconst))"
456 | ]
457 | },
458 | {
459 | "attachments": {},
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "## Finding good (and bad) movies to watch\n",
464 | "\n",
465 | "Join the IMDB rankings with information about the movies."
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": null,
471 | "metadata": {},
472 | "outputs": [],
473 | "source": [
474 | "joined = title_basics.join(title_ratings, \"tconst\")\n",
475 | "joined"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": null,
481 | "metadata": {},
482 | "outputs": [],
483 | "source": [
484 | "joined.title_type.value_counts().order_by(_.title_type_count.desc())"
485 | ]
486 | },
487 | {
488 | "attachments": {},
489 | "cell_type": "markdown",
490 | "metadata": {},
491 | "source": [
492 | "Filter down to movies."
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": null,
498 | "metadata": {},
499 | "outputs": [],
500 | "source": [
501 | "joined = joined.filter(_.title_type == \"movie\")\n",
502 | "joined"
503 | ]
504 | },
505 | {
506 | "attachments": {},
507 | "cell_type": "markdown",
508 | "metadata": {},
509 | "source": [
510 | "Reorder the columns and drop some."
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": null,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": [
519 | "joined = joined.select(\n",
520 | " \"tconst\",\n",
521 | " \"primary_title\",\n",
522 | " \"average_rating\",\n",
523 | " \"num_votes\",\n",
524 | " \"genres\",\n",
525 | " \"runtime_minutes\",\n",
526 | ")\n",
527 | "joined"
528 | ]
529 | },
530 | {
531 | "attachments": {},
532 | "cell_type": "markdown",
533 | "metadata": {},
534 | "source": [
535 | "Sort by the average rating."
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": null,
541 | "metadata": {},
542 | "outputs": [],
543 | "source": [
544 | "joined = joined.order_by([_.average_rating.desc(), _.num_votes.desc()])\n",
545 | "joined"
546 | ]
547 | },
548 | {
549 | "attachments": {},
550 | "cell_type": "markdown",
551 | "metadata": {},
552 | "source": [
553 | "A lot of 10/10 movies I haven't heard of...let's filter to movies with at least `N` votes."
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": null,
559 | "metadata": {},
560 | "outputs": [],
561 | "source": [
562 | "N = 50000\n",
563 | "joined = joined.filter(_.num_votes > N)\n",
564 | "joined"
565 | ]
566 | },
567 | {
568 | "attachments": {},
569 | "cell_type": "markdown",
570 | "metadata": {},
571 | "source": [
572 | "What if you're in the mood for a bad movie?"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": null,
578 | "metadata": {},
579 | "outputs": [],
580 | "source": [
581 | "joined = joined.order_by([_.average_rating.asc(), _.num_votes.desc()])\n",
582 | "joined"
583 | ]
584 | },
585 | {
586 | "attachments": {},
587 | "cell_type": "markdown",
588 | "metadata": {},
589 | "source": [
590 | "And specifically a bad comedy?"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": null,
596 | "metadata": {},
597 | "outputs": [],
598 | "source": [
599 | "joined = joined.filter(_.genres.contains(\"Comedy\"))\n",
600 | "joined"
601 | ]
602 | },
603 | {
604 | "attachments": {},
605 | "cell_type": "markdown",
606 | "metadata": {},
607 | "source": [
608 | "Perfect!"
609 | ]
610 | },
611 | {
612 | "attachments": {},
613 | "cell_type": "markdown",
614 | "metadata": {},
615 | "source": [
616 | "## Next Steps"
617 | ]
618 | },
619 | {
620 | "attachments": {},
621 | "cell_type": "markdown",
622 | "metadata": {},
623 | "source": [
624 | "We only used two of the IMDB tables. What else can we do with the rest of the data? Play around and let us know!"
625 | ]
626 | }
627 | ],
628 | "metadata": {
629 | "interpreter": {
630 | "hash": "db67a4c5f346815e3207df1348e9e718605305208b0cc89f618da4cb81ede2ba"
631 | },
632 | "kernelspec": {
633 | "display_name": "Python 3 (ipykernel)",
634 | "language": "python",
635 | "name": "python3"
636 | },
637 | "language_info": {
638 | "codemirror_mode": {
639 | "name": "ipython",
640 | "version": 3
641 | },
642 | "file_extension": ".py",
643 | "mimetype": "text/x-python",
644 | "name": "python",
645 | "nbconvert_exporter": "python",
646 | "pygments_lexer": "ipython3",
647 | "version": "3.11.3"
648 | }
649 | },
650 | "nbformat": 4,
651 | "nbformat_minor": 2
652 | }
653 |
--------------------------------------------------------------------------------
/requirements-test.txt:
--------------------------------------------------------------------------------
1 | nbmake
2 | pytest
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | notebook
2 | jupyterlab == 3.4.8
3 | ipywidgets
4 | altair
5 | pandas < 2.1
6 | ibis-framework[sqlite,duckdb,clickhouse]
7 | ibis-substrait < 3.1
8 |
--------------------------------------------------------------------------------
/scripts/prepare_campaign_finance_data.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from urllib.request import urlretrieve
3 | from zipfile import ZipFile
4 |
5 | import ibis
6 |
7 | # Download and unzip the 2018 individual contributions data
8 | url = (
9 | "https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1."
10 | "amazonaws.com/bulk-downloads/2018/indiv18.zip"
11 | )
12 |
13 | root_dir = Path(__file__).resolve().parent.parent
14 | data_dir = root_dir.joinpath("data")
15 | data_dir.mkdir(exist_ok=True)
16 |
17 | zip_path = data_dir.joinpath("indiv18.zip")
18 | csv_path = data_dir.joinpath("itcont.txt")
19 | parquet_path = data_dir.joinpath("itcont.parquet")
20 |
21 | if not zip_path.exists():
22 | print("Downloading indiv18.zip...")
23 | urlretrieve(url, zip_path)
24 | else:
25 | print("indiv18.zip already downloaded")
26 |
27 | if not csv_path.exists():
28 | print("Extracting itcont.txt...")
29 | with ZipFile(zip_path) as zip_file:
30 | zip_file.extract("itcont.txt", path=data_dir)
31 | else:
32 | print("itcont.txt already extracted")
33 |
34 | if not parquet_path.exists():
35 | print("Generating itcont.parquet...")
36 | # Read in the CSV
37 | t = ibis.read_csv(csv_path)
38 |
39 | # The CSV doesn't have a header, we need to manually add titles
40 | header = [
41 | "CMTE_ID",
42 | "AMNDT_IND",
43 | "RPT_TP",
44 | "TRANSACTION_PGI",
45 | "IMAGE_NUM",
46 | "TRANSACTION_TP",
47 | "ENTITY_TP",
48 | "NAME",
49 | "CITY",
50 | "STATE",
51 | "ZIP_CODE",
52 | "EMPLOYER",
53 | "OCCUPATION",
54 | "TRANSACTION_DT",
55 | "TRANSACTION_AMT",
56 | "OTHER_ID",
57 | "TRAN_ID",
58 | "FILE_NUM",
59 | "MEMO_CD",
60 | "MEMO_TEXT",
61 | "SUB_ID",
62 | ]
63 | t = t.relabel(dict(zip(t.columns, header)))
64 |
65 | # For the analysis, we're only going to use a few of the columns. To save
66 | # bandwidth, lets select out only the columns we'll be using.
67 | columns = [
68 | "CMTE_ID",
69 | "TRANSACTION_PGI",
70 | "ENTITY_TP",
71 | "CITY",
72 | "STATE",
73 | "TRANSACTION_DT",
74 | "TRANSACTION_AMT",
75 | ]
76 | t = t[columns]
77 |
78 | # Write out a parquet file
79 | t.to_parquet(parquet_path, compression="zstd")
80 | else:
81 | print("itcont.parquet already exists")
82 |
--------------------------------------------------------------------------------
/tutorial/01-Introduction-to-Ibis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Getting started"
9 | ]
10 | },
11 | {
12 | "attachments": {},
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "### Setting up"
17 | ]
18 | },
19 | {
20 | "attachments": {},
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "To start using `ibis`, you need a Python environment with `ibis` installed.\n",
25 | "\n",
26 | "If you're running through this tutorial on your own machine (rather than binder) please follow the [installation instructions](https://ibis-project.org/install/ to setup an environment with the `SQLite` backend.\n",
27 | "\n",
28 | "You'll also need access to the `geography.db` database hosted [here](https://storage.googleapis.com/ibis-tutorial-data/geography.db).\n",
29 | "\n",
30 | "Every notebook in the tutorial starts with the following code to download the database if it doesn't already exist:"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "from tutorial_utils import setup\n",
40 | "\n",
41 | "setup()"
42 | ]
43 | },
44 | {
45 | "attachments": {},
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "You should now have `ibis` and the tutorial data all setup.\n",
50 | "We're ready to get started. First lets import `ibis`."
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "import ibis"
60 | ]
61 | },
62 | {
63 | "attachments": {},
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "To make things easier, we will be using `ibis`'s **interactive mode** in order to see the results of an operation immediately.\n",
68 | "This is the recommended mode to use when doing interactive/iterative work with `ibis`.\n",
69 | "\n",
70 | "When deploying production code you'll typically run in **non-interactive/lazy mode**. More details on `ibis` non-interactive mode are covered in [a later notebook](./03-Expressions-Lazy-Mode-Logging.ipynb).\n",
71 | "\n",
72 | "To enable interactive mode, run:"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "ibis.options.interactive = True"
82 | ]
83 | },
84 | {
85 | "attachments": {},
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "### Creating a connection"
90 | ]
91 | },
92 | {
93 | "attachments": {},
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "Next thing we need is to create a **connection object**.\n",
98 | "\n",
99 | "The connection defines where the data is stored and where the computations will be performed.\n",
100 | "\n",
101 | "This is not the same as in `pandas` when we import the data from an external source (e.g. `pandas.read_sql`). In this case `pandas` loads data into memory and performs the computations itself. `ibis` will not load the data and perform any computation, but instead will leave the data in the backend defined in the connection, and will _ask_ the backend to perform the computations.\n",
102 | "\n",
103 | "In this tutorial we will be using a `SQLite` connection for its simplicity (no installation is needed). But `ibis` can work with many different backends, including big data systems, or GPU-accelerated analytical databases. As well as most common relational databases (`PostgreSQL`, `MySQL`, ...)."
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "connection = ibis.sqlite.connect(\"geography.db\")"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "### Exploring the data\n",
120 | "\n",
121 | "To list the tables in the `connection` object, we can use the `.list_tables()` method. If you are using Jupyter, you can see all the methods and attributes of the `connection` object by writing `connection.` and pressing the `` key."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "connection.list_tables()"
131 | ]
132 | },
133 | {
134 | "attachments": {},
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "These three tables include world countries data, their GDP by year and their independence information.\n",
139 | "\n",
140 | "* The data for the countries table has been obtained from [GeoNames](https://www.geonames.org/countries/).\n",
141 | "* The GDP table will be used in the next tutorial, and the data for it has been obtained from the\n",
142 | "[World Bank website](https://data.worldbank.org/indicator/NY.GDP.MKTP.CD).\n",
143 | "* The data for the `independence` table has been obtained from [Wikipedia](https://en.wikipedia.org/wiki/List_of_national_independence_days) and will be used in one of the following tutorials.\n",
144 | "\n",
145 | "Next, we want to access a specific table in the database. We can create a handler to the `countries` table with:"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "countries = connection.table(\"countries\")"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "To list the columns of the `countries` table, we can use the `columns` attribute.\n",
162 | "\n",
163 | "Again, Jupyter users can see all the methods and attributes of the `countries` object by typing `countries.` and pressing ``."
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": null,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "countries.columns"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "We can now access a sample of the data. Let's focus on the `name`, `continent` and `population` columns to start with. We can visualize the values of the columns with:"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "countries[\"name\", \"continent\", \"population\"]"
189 | ]
190 | },
191 | {
192 | "attachments": {},
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "The table is too big for all the results to be displayed, and we probably don't want to see all of them at once anyway. For this reason, just the first 10 rows of the results are displayed.\n",
197 | "\n",
198 | "To check how many rows a table has, we can use the `.count()` method:"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "countries.count()"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "To fetch just a subset of the rows, we can use the `.limit(n)` method, where `n` is the number of samples we want. In this case we will fetch the first `3` countries from the table:"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "countries[\"name\", \"continent\", \"population\"].limit(3)"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "### Filters and order\n",
231 | "\n",
232 | "Now that we've got an intuition of the data available in the table `countries`, we will extract some information from it by applying filters and sorting the data.\n",
233 | "\n",
234 | "Let's focus on a single continent. We can see a list of unique continents in the table using the `.distinct()` method:"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "countries[[\"continent\"]].distinct()"
244 | ]
245 | },
246 | {
247 | "attachments": {},
248 | "cell_type": "markdown",
249 | "metadata": {},
250 | "source": [
251 | "We will focus on Asia (`AS`). We can identify which rows belong to Asian countries using the standard Python `==` operator:"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "countries[\"continent\"] == \"AS\""
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "The result has a value `True` for rows where the condition is true, and the value `False` when it's not.\n",
268 | "\n",
269 | "We can provide this expression to the method `.filter()`, and save the result in the variable `asian_countries` for future use."
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "asian_countries = countries[\"name\", \"continent\", \"population\"].filter(\n",
279 | " countries[\"continent\"] == \"AS\"\n",
280 | ")\n",
281 | "asian_countries"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "We can check how many countries exist in Asia (based on the information in the database) by using the `.count()` method we've already seen:"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "asian_countries.count()"
298 | ]
299 | },
300 | {
301 | "attachments": {},
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "Next, we want to find the most populated countries in Asia. We are going to sort the countries by the column `population` and fetch the first 10. We can use the `.order_by()` method to sort by a column:"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "asian_countries.order_by(\"population\").limit(10)"
315 | ]
316 | },
317 | {
318 | "attachments": {},
319 | "cell_type": "markdown",
320 | "metadata": {},
321 | "source": [
322 | "Because the default for `.order_by` is ascending order (ascending order like in `1, 2, 3, 4`) the operation will return the least populated countries. This behavior is consistent with SQL `ORDER BY`.\n",
323 | "\n",
324 | "To order in descending order we can use `ibis.desc()`:"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "asian_countries.order_by(ibis.desc(\"population\")).limit(10)"
334 | ]
335 | },
336 | {
337 | "attachments": {},
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "This is the list of the 10 most populated countries based on the data from [GeoNames](https://www.geonames.org/).\n",
342 | "\n",
343 | "**_To learn more about Ibis, continue to our next tutorial: [Aggregating and joining data](./02-Aggregates-Joins.ipynb)._**"
344 | ]
345 | }
346 | ],
347 | "metadata": {
348 | "kernelspec": {
349 | "display_name": "Python 3 (ipykernel)",
350 | "language": "python",
351 | "name": "python3"
352 | },
353 | "language_info": {
354 | "codemirror_mode": {
355 | "name": "ipython",
356 | "version": 3
357 | },
358 | "file_extension": ".py",
359 | "mimetype": "text/x-python",
360 | "name": "python",
361 | "nbconvert_exporter": "python",
362 | "pygments_lexer": "ipython3",
363 | "version": "3.10.8"
364 | }
365 | },
366 | "nbformat": 4,
367 | "nbformat_minor": 4
368 | }
369 |
--------------------------------------------------------------------------------
/tutorial/02-Aggregates-Joins.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Aggregating and joining data\n",
8 | "\n",
9 | "This is the second introductory tutorial to Ibis. If you are new to Ibis, you may want to start at [the beginning of this tutorial](./01-Introduction-to-Ibis.ipynb).\n",
10 | "\n",
11 | "In the first notebook we saw how to load and query data using `ibis`. In this notebook we'll continue with the same dataset, building up some more complicated queries."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from tutorial_utils import setup\n",
21 | "\n",
22 | "setup()"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "import ibis\n",
32 | "\n",
33 | "ibis.options.interactive = True\n",
34 | "\n",
35 | "connection = ibis.sqlite.connect(\"geography.db\")\n",
36 | "countries = connection.table(\"countries\")\n",
37 | "\n",
38 | "countries[\"name\", \"continent\", \"area_km2\", \"population\"]"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Expressions\n",
46 | "\n",
47 | "We will continue by exploring the data by continent. We will start by creating an expression\n",
48 | "with the continent names, since our table only contains the abbreviations.\n",
49 | "\n",
50 | "An expression is one or more operations performed over the data. They can be used to retrieve the\n",
51 | "data or to build more complex operations.\n",
52 | "\n",
53 | "In this case we will use a `case` conditional statement to replace values depending on a condition.\n",
54 | "A `case` expression will return a case builder, and must be followed by one or more `when` calls,\n",
55 | "optionally an `else_` call, and must end with a call to `end`, to complete the full expression.\n",
56 | "The expression where `case` is called (`countries['continent']` in this case)\n",
57 | "is evaluated to see if it's equal to any of the first arguments of the calls to `when`. And the second\n",
58 | "argument is returned. If the value does not match any of the `when` values, the value of `else_` is returned."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "continent_name = (\n",
68 | " countries[\"continent\"]\n",
69 | " .case()\n",
70 | " .when(\"NA\", \"North America\")\n",
71 | " .when(\"SA\", \"South America\")\n",
72 | " .when(\"EU\", \"Europe\")\n",
73 | " .when(\"AF\", \"Africa\")\n",
74 | " .when(\"AS\", \"Asia\")\n",
75 | " .when(\"OC\", \"Oceania\")\n",
76 | " .when(\"AN\", \"Antarctica\")\n",
77 | " .else_(\"Unknown continent\")\n",
78 | " .end()\n",
79 | " .name(\"continent_name\")\n",
80 | ")\n",
81 | "continent_name"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "What we did is take the values of the column `countries['continent']`, and we created a calculated\n",
89 | "column with the names of the continents, as defined in the `when` methods.\n",
90 | "\n",
91 | "This calculated column is an expression. The computations didn't happen when defining the `continent_name`\n",
92 | "variable, and the results are not stored. They have been computed when we printed its content.\n",
93 | "\n",
94 | "We can see that by checking the type of `continent_name`:"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "type(continent_name)"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "In the next tutorial we will see more about eager and lazy mode, and when operations are being\n",
111 | "executed. For now we can think that the query to the database happens only when we want to see\n",
112 | "the results.\n",
113 | "\n",
114 | "The important part is that now we can use our `continent_name` expression in other expressions.\n",
115 | "For example, since this is a column (a `StringColumn` to be specific), we can use it as a column\n",
116 | "to query the countries table.\n",
117 | "\n",
118 | "Note that when we created the expression we added `.name('continent_name')` to it, so the column\n",
119 | "has a name when being returned."
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "countries[\"name\", continent_name, \"area_km2\", \"population\"]"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "Just for illustration, let's repeat the same query, but renaming the expression to `continent`\n",
136 | "when using it in the list of columns to fetch."
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "countries[\"name\", continent_name.name(\"continent\"), \"area_km2\", \"population\"]"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "## Aggregating data\n",
153 | "\n",
154 | "Now, let's group our data by continent, and let's find the total population of each."
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "countries.group_by(continent_name).aggregate(\n",
164 | " countries[\"population\"].sum().name(\"total_population\")\n",
165 | ")"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "We can see how Asia is the most populated country, followed by Africa. Antarctica is the least populated,\n",
173 | "as we would expect.\n",
174 | "\n",
175 | "The code to aggregate has two main parts:\n",
176 | "- The `group_by` method, that receive the column, expression or list of them to group by\n",
177 | "- The `aggregate` method, that receives an expression with the reduction we want to apply\n",
178 | "\n",
179 | "To make things a bit clearer, let's first save the reduction."
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "total_population = countries[\"population\"].sum().name(\"total_population\")\n",
189 | "total_population"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "As we can see, if we perform the operation directly, we will get the sum of the total in the column.\n",
197 | "\n",
198 | "But if we take the `total_population` expression as the parameter of the `aggregate` method, then the total is computed\n",
199 | "over every group defined by the `group_by` method."
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "countries.group_by(continent_name).aggregate(total_population)"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "If we want to compute two aggregates at the same time, we can pass a list to the `aggregate` method.\n",
216 | "\n",
217 | "For illustration, we use the `continent` column, instead of the `continent_names` expression. We can\n",
218 | "use both column names and expressions, and also a list with any of them (e.g. `[continent_names, 'name']`."
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "countries.group_by(\"continent\").aggregate(\n",
228 | " [total_population, countries[\"area_km2\"].mean().name(\"average_area\")]\n",
229 | ")"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "## Joining data\n",
237 | "\n",
238 | "Now we are going to get the total gross domestic product (GDP) for each continent. In this case, the GDP data\n",
239 | "is not in the same table `countries`, but in a table `gdp`."
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "gdp = connection.table(\"gdp\")\n",
249 | "gdp"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "The table contains information for different years, we can easily check the range with:"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "gdp[\"year\"].min(), gdp[\"year\"].max()"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "Now, we are going to join this data with the `countries` table so we can obtain the continent\n",
273 | "of each country. The `countries` table has several different codes for the countries. Let's find out which\n",
274 | "one matches the three letter code in the `gdp` table."
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "countries[\"iso_alpha2\", \"iso_alpha3\", \"iso_numeric\", \"fips\", \"name\"]"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {},
289 | "source": [
290 | "The `country_code` in `gdp` corresponds to `iso_alpha3` in the `countries` table. We can also see\n",
291 | "how the `gdp` table has `10,000` rows, while `countries` has `252`. We will start joining the\n",
292 | "two tables by the codes that match, discarding the codes that do not exist in both tables.\n",
293 | "This is called an inner join."
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "countries_and_gdp = countries.inner_join(\n",
303 | " gdp, predicates=countries[\"iso_alpha3\"] == gdp[\"country_code\"]\n",
304 | ")\n",
305 | "countries_and_gdp[countries, gdp]"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "We joined the table with the information for all years. Now we are going to just take the information about the last available year, 2017."
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "gdp_2017 = gdp.filter(gdp[\"year\"] == 2017)\n",
322 | "gdp_2017"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "Joining with the new expression we get:"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "countries_and_gdp = countries.inner_join(\n",
339 | " gdp_2017, predicates=countries[\"iso_alpha3\"] == gdp_2017[\"country_code\"]\n",
340 | ")\n",
341 | "countries_and_gdp[countries, gdp_2017]"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "metadata": {},
347 | "source": [
348 | "We have called the `inner_join` method of the `countries` table and passed\n",
349 | "the `gdp` table as a parameter. The method receives a second parameter, `predicates`, that is used to specify\n",
350 | "how the join will be performed. In this case we want the `iso_alpha3` column in `countries` to\n",
351 | "match the `country_code` column in `gdp`. This is specified with the expression\n",
352 | "`countries['iso_alpha3'] == gdp['country_code']`.\n"
353 | ]
354 | }
355 | ],
356 | "metadata": {
357 | "kernelspec": {
358 | "display_name": "Python 3 (ipykernel)",
359 | "language": "python",
360 | "name": "python3"
361 | },
362 | "language_info": {
363 | "codemirror_mode": {
364 | "name": "ipython",
365 | "version": 3
366 | },
367 | "file_extension": ".py",
368 | "mimetype": "text/x-python",
369 | "name": "python",
370 | "nbconvert_exporter": "python",
371 | "pygments_lexer": "ipython3",
372 | "version": "3.10.8"
373 | }
374 | },
375 | "nbformat": 4,
376 | "nbformat_minor": 4
377 | }
378 |
--------------------------------------------------------------------------------
/tutorial/03-Expressions-Lazy-Mode-Logging.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lazy Mode and Logging\n",
8 | "\n",
9 | "So far, we have seen Ibis in interactive mode. Interactive mode (also known as eager mode) makes Ibis return the\n",
10 | "results of an operation immediately.\n",
11 | "\n",
12 | "In most cases, instead of using interactive mode, it makes more sense to use the default lazy mode.\n",
13 | "In lazy mode, Ibis won't be executing the operations automatically, but instead, will generate an\n",
14 | "expression to be executed at a later time.\n",
15 | "\n",
16 | "Let's see this in practice, starting with the same database as in previous tutorials."
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "from tutorial_utils import setup\n",
26 | "\n",
27 | "setup()"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import ibis\n",
37 | "\n",
38 | "connection = ibis.sqlite.connect(\"geography.db\")\n",
39 | "countries = connection.table(\"countries\")"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "In previous tutorials, we set interactive mode to `True`, and we obtained the result\n",
47 | "of every operation."
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "ibis.options.interactive = True\n",
57 | "\n",
58 | "countries[\"name\", \"continent\", \"population\"].limit(3)"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "But now let's see what happens if we leave the `interactive` option to `False` (the default),\n",
66 | "and we operate in lazy mode."
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "ibis.options.interactive = False\n",
76 | "\n",
77 | "countries[\"name\", \"continent\", \"population\"].limit(3)"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "What we find is the graph of the expressions that would return the desired result instead of the result itself.\n",
85 | "\n",
86 | "Let's analyze the expressions in the graph:\n",
87 | "\n",
88 | "- We query the `countries` table (all rows and all columns)\n",
89 | "- We select the `name`, `continent` and `population` columns\n",
90 | "- We limit the results to only the first `3` rows\n",
91 | "\n",
92 | "Now consider that the data is in a database, possibly in a different host than the one executing Ibis.\n",
93 | "Also consider that the results returned to the user need to be moved to the memory of the host executing Ibis.\n",
94 | "\n",
95 | "When using interactive (or eager) mode, if we perform one operation at a time, we would do:\n",
96 | "\n",
97 | "- We would move all the rows and columns from the backend (database, big data system, etc.) into memory\n",
98 | "- Once in memory, we would discard all the columns but `name`, `continent` and `population`\n",
99 | "- After that, we would discard all the rows, except the first `3`\n",
100 | "\n",
101 | "This is not very efficient. If you consider that the table can have millions of rows, backed by a\n",
102 | "big data system like Spark or Impala, this may not even be possible (not enough memory to load all the data).\n",
103 | "\n",
104 | "The solution is to use lazy mode. In lazy mode, instead of obtaining the results after each operation,\n",
105 | "we build an expression (a graph) of all the operations that need to be done. After all the operations\n",
106 | "are recorded, the graph is sent to the backend which will perform the operation in an efficient way - only\n",
107 | "moving to memory the required data.\n",
108 | "\n",
109 | "You can think of this as writing a shopping list and requesting someone to go to the supermarket and buy\n",
110 | "everything you need once the list is complete. As opposed as getting someone to bring all the products of\n",
111 | "the supermarket to your home and then return everything you don't want.\n",
112 | "\n",
113 | "Let's continue with our example, save the expression in a variable `countries_expression`, and check its type."
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "countries_expression = countries[\"name\", \"continent\", \"population\"].limit(3)\n",
123 | "type(countries_expression)"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "The type is an Ibis `TableExpr`, since the result is a table (in a broad way, you can consider it a dataframe).\n",
131 | "\n",
132 | "Now we have our query instructions (our expression, fetching only 3 columns and 3 rows) in the variable `countries_expression`.\n",
133 | "\n",
134 | "At this point, nothing has been requested from the database. We have defined what we want to extract, but we didn't\n",
135 | "request it from the database yet. We can continue building our expression if we haven't finished yet. Or once we\n",
136 | "are done, we can simply request it from the database using the method `.to_pandas()`."
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "countries_expression.to_pandas()"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "We can build other types of expressions, for example, one that instead of returning a table,\n",
153 | "returns a columns."
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "population_in_millions = (countries[\"population\"] / 1_000_000).name(\n",
163 | " \"population_in_millions\"\n",
164 | ")\n",
165 | "population_in_millions"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "If we check its type, we can see how it is a `FloatingColumn` expression."
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "type(population_in_millions)"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "We can combine the previous expression to be a column of a table expression."
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "countries[\"name\", \"continent\", population_in_millions].limit(3)"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "Since we are in lazy mode (not interactive), those expressions don't request any data from the database\n",
205 | "unless explicitly requested with `.to_pandas()`.\n",
206 | "\n",
207 | "## Logging queries\n",
208 | "\n",
209 | "For SQL backends (and for others when it makes sense), the query sent to the database can be logged.\n",
210 | "This can be done by setting the `verbose` option to `True`."
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "ibis.options.verbose = True\n",
220 | "\n",
221 | "countries[\"name\", \"continent\", population_in_millions].limit(3).to_pandas()"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "By default, the logging is done to the terminal, but we can process the query with a custom function.\n",
229 | "This allows us to save executed queries to a file, save to a database, send them to a web service, etc.\n",
230 | "\n",
231 | "For example, to save queries to a file, we can write a custom function that given a query, saves it to a\n",
232 | "log file."
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "from pathlib import Path\n",
242 | "\n",
243 | "\n",
244 | "def log_query_to_file(query: str) -> None:\n",
245 | " \"\"\"Log queries to `./tutorial_queries.log`.\"\"\"\n",
246 | " fname = Path() / \"tutorial_queries.log\"\n",
247 | " query = query.replace(\"\\n\", \" \")\n",
248 | " with fname.open(mode=\"a\") as f:\n",
249 | " # log on a single line\n",
250 | " f.write(f\"{query}\\n\")"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "Then we can set the `verbose_log` option to the custom function, execute one query,\n",
258 | "wait one second, and execute another query."
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "ibis.options.verbose_log = log_query_to_file\n",
268 | "\n",
269 | "countries.to_pandas()\n",
270 | "countries[\"name\", \"continent\", population_in_millions].limit(3).to_pandas()"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "This has created a log file in `$PWD/tutorial_queries.log` where the executed queries have been logged."
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "!cat -n $PWD/tutorial_queries.log"
287 | ]
288 | }
289 | ],
290 | "metadata": {
291 | "kernelspec": {
292 | "display_name": "Python 3 (ipykernel)",
293 | "language": "python",
294 | "name": "python3"
295 | },
296 | "language_info": {
297 | "codemirror_mode": {
298 | "name": "ipython",
299 | "version": 3
300 | },
301 | "file_extension": ".py",
302 | "mimetype": "text/x-python",
303 | "name": "python",
304 | "nbconvert_exporter": "python",
305 | "pygments_lexer": "ipython3",
306 | "version": "3.10.8"
307 | }
308 | },
309 | "nbformat": 4,
310 | "nbformat_minor": 4
311 | }
312 |
--------------------------------------------------------------------------------
/tutorial/04-More-Value-Expressions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# More Value Expressions\n",
9 | "Let's walk through some more value expressions."
10 | ]
11 | },
12 | {
13 | "attachments": {},
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "## Setup"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "from tutorial_utils import setup\n",
27 | "\n",
28 | "setup()"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "import ibis\n",
38 | "\n",
39 | "ibis.options.interactive = True\n",
40 | "\n",
41 | "connection = ibis.sqlite.connect(\"geography.db\")"
42 | ]
43 | },
44 | {
45 | "attachments": {},
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "## Type casting\n",
50 | "\n",
51 | "The Ibis type system supports the most common data types used in analytics, including support for nested types like lists, structs, and maps.\n",
52 | "\n",
53 | "Type names can be used to cast from one type to another."
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "countries = connection.table(\"countries\")\n",
63 | "countries"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "countries = connection.table(\"countries\")\n",
73 | "countries.population.cast(\"float\").sum()"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "countries.area_km2.cast(\"int32\").sum()"
83 | ]
84 | },
85 | {
86 | "attachments": {},
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "## Case / if-then-else expressions\n",
91 | "\n",
92 | "\n",
93 | "We support a number of variants of the SQL-equivalent `CASE` expression, and will add more API functions over time to meet different use cases and enhance the expressiveness of any branching-based value logic."
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "expr = (\n",
103 | " countries.continent.case()\n",
104 | " .when(\"AF\", \"Africa\")\n",
105 | " .when(\"AN\", \"Antarctica\")\n",
106 | " .when(\"AS\", \"Asia\")\n",
107 | " .when(\"EU\", \"Europe\")\n",
108 | " .when(\"NA\", \"North America\")\n",
109 | " .when(\"OC\", \"Oceania\")\n",
110 | " .when(\"SA\", \"South America\")\n",
111 | " .else_(countries.continent)\n",
112 | " .end()\n",
113 | " .name(\"continent_name\")\n",
114 | ")\n",
115 | "\n",
116 | "expr.value_counts()"
117 | ]
118 | },
119 | {
120 | "attachments": {},
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "If the `else_` default condition is not provided, any values not matching one of the conditions will be `NULL`."
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "expr = (\n",
134 | " countries.continent.case()\n",
135 | " .when(\"AF\", \"Africa\")\n",
136 | " .when(\"AS\", \"Asia\")\n",
137 | " .when(\"EU\", \"Europe\")\n",
138 | " .when(\"NA\", \"North America\")\n",
139 | " .when(\"OC\", \"Oceania\")\n",
140 | " .when(\"SA\", \"South America\")\n",
141 | " .end()\n",
142 | " .name(\"continent_name_with_nulls\")\n",
143 | ")\n",
144 | "\n",
145 | "expr.value_counts()"
146 | ]
147 | },
148 | {
149 | "attachments": {},
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "To test for an arbitrary series of boolean conditions, use the `case` API method and pass any boolean expressions potentially involving columns of the table:"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "expr = (\n",
163 | " ibis.case()\n",
164 | " .when(countries.population > 25_000_000, \"big\")\n",
165 | " .when(countries.population < 5_000_000, \"small\")\n",
166 | " .else_(\"medium\")\n",
167 | " .end()\n",
168 | " .name(\"size\")\n",
169 | ")\n",
170 | "\n",
171 | "countries[\"name\", \"population\", expr].limit(10)"
172 | ]
173 | },
174 | {
175 | "attachments": {},
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "Simple ternary-cases (like the Python `X if COND else Y`) can be written using the `ifelse` function:"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "expr = (countries.continent == \"AS\").ifelse(\"Asia\", \"Not Asia\").name(\"is_asia\")\n",
189 | "\n",
190 | "countries[\"name\", \"continent\", expr].limit(10)"
191 | ]
192 | },
193 | {
194 | "attachments": {},
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "## Set membership\n",
199 | "\n",
200 | "\n",
201 | "The `isin` and `notin` functions are like their pandas counterparts. These can take:\n",
202 | "\n",
203 | "- A list of value expressions, either literal values or other column expressions\n",
204 | "- An array/column expression of some kind"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "is_america = countries.continent.isin([\"NA\", \"SA\"])\n",
214 | "countries[is_america].continent.value_counts()"
215 | ]
216 | },
217 | {
218 | "attachments": {},
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "You can also check for membership in an array. Here is an example of filtering based on the top 3 (ignoring ties) most frequently-occurring values in the `string_col` column of alltypes:"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "top_continents = (\n",
232 | " countries.continent.value_counts()\n",
233 | " .order_by(ibis.desc(\"continent_count\"))\n",
234 | " .limit(3)\n",
235 | " .continent\n",
236 | ")\n",
237 | "top_continents_filter = countries.continent.isin(top_continents)\n",
238 | "expr = countries[top_continents_filter]\n",
239 | "\n",
240 | "expr.count()"
241 | ]
242 | },
243 | {
244 | "attachments": {},
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "This is a common enough operation that we provide a special analytical filter function `topk`:"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "countries.continent.topk(3)"
258 | ]
259 | },
260 | {
261 | "attachments": {},
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "Cool, huh? More on `topk` later."
266 | ]
267 | },
268 | {
269 | "attachments": {},
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "## Null Checking\n",
274 | "\n",
275 | "Like their pandas equivalents, the `isnull` and `notnull` functions return True values if the values are null, or non-null, respectively. For example:"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": null,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "expr = (\n",
285 | " countries.continent.case()\n",
286 | " .when(\"AF\", \"Africa\")\n",
287 | " .when(\"EU\", \"Europe\")\n",
288 | " .when(\"AS\", \"Asia\")\n",
289 | " .end()\n",
290 | " .name(\"top_continent_name\")\n",
291 | ")\n",
292 | "\n",
293 | "expr.isnull().value_counts()"
294 | ]
295 | },
296 | {
297 | "attachments": {},
298 | "cell_type": "markdown",
299 | "metadata": {},
300 | "source": [
301 | "Functions like `isnull` can be combined with `case` expressions or functions like `ifelse` to replace null values with some other value. `ifelse` here will use the first value supplied for any `True` value and the second value for any `False` value. Either value can be a scalar or array. "
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "expr2 = expr.isnull().ifelse(\"Other continent\", expr).name(\"continent\")\n",
311 | "expr2.value_counts()"
312 | ]
313 | },
314 | {
315 | "attachments": {},
316 | "cell_type": "markdown",
317 | "metadata": {},
318 | "source": [
319 | "## Distinct-based operations\n",
320 | "\n",
321 | "\n",
322 | "Ibis supports using `distinct` to remove duplicate rows or values on tables or arrays. For example:"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": [
331 | "countries[[\"continent\"]].distinct()"
332 | ]
333 | },
334 | {
335 | "attachments": {},
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "This can be combined with `count` to form a reduction metric:"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "metric = countries[[\"continent\"]].distinct().count().name(\"num_continents\")\n",
349 | "metric"
350 | ]
351 | },
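   352 | {
   353 | "attachments": {},
   354 | "cell_type": "markdown",
   355 | "metadata": {},
   356 | "source": [
   357 | "On a single column, the same count can be written more compactly; a sketch using `nunique`, which counts distinct values:\n",
   358 | "\n",
   359 | "```python\n",
   360 | "countries.continent.nunique().name(\"num_continents\")\n",
   361 | "```"
   362 | ]
   363 | },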
352 | {
353 | "attachments": {},
354 | "cell_type": "markdown",
355 | "metadata": {},
356 | "source": [
357 | "## String operations\n",
358 | "\n",
359 | "\n",
360 | "What's supported is pretty basic right now. We intend to support the full gamut of regular expression munging with a nice API, though in some cases some work will be required on SQLite's backend to support everything. "
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": null,
366 | "metadata": {},
367 | "outputs": [],
368 | "source": [
369 | "countries[[\"name\"]].limit(5)"
370 | ]
371 | },
372 | {
373 | "attachments": {},
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "At the moment, basic substring operations (`substr`, with conveniences `left` and `right`) and Python-like APIs such as `lower` and `upper` (for case normalization) are supported. So you could count first letter occurrences in a string column like so:"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "expr = countries.name.lower().left(1).name(\"first_letter\")\n",
387 | "expr.value_counts().order_by(ibis.desc(\"first_letter_count\")).limit(10)"
388 | ]
389 | },
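   390 | {
   391 | "attachments": {},
   392 | "cell_type": "markdown",
   393 | "metadata": {},
   394 | "source": [
   395 | "The example above used `lower` and `left`; here is a quick sketch of the other basic operations mentioned (each is a lazy column expression):\n",
   396 | "\n",
   397 | "```python\n",
   398 | "countries.name.substr(0, 3)  # three characters starting at position 0\n",
   399 | "countries.name.upper()  # case normalization\n",
   400 | "```"
   401 | ]
   402 | },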
390 | {
391 | "attachments": {},
392 | "cell_type": "markdown",
393 | "metadata": {},
394 | "source": [
395 | "For fuzzy and regex filtering/searching, you can use one of the following\n",
396 | "\n",
397 | "- `like`, works as the SQL `LIKE` keyword\n",
398 | "- `rlike`, like `re.search` or SQL `RLIKE`\n",
399 | "- `contains`, like `x in str_value` in Python"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": null,
405 | "metadata": {},
406 | "outputs": [],
407 | "source": [
408 | "countries[countries.name.like(\"%GE%\")].name"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": null,
414 | "metadata": {},
415 | "outputs": [],
416 | "source": [
417 | "countries[countries.name.lower().rlike(\".*ge.*\")].name"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": [
426 | "countries[countries.name.lower().contains(\"ge\")].name"
427 | ]
428 | },
429 | {
430 | "attachments": {},
431 | "cell_type": "markdown",
432 | "metadata": {},
433 | "source": [
434 | "## Timestamp operations\n",
435 | "\n",
436 | "\n",
437 | "Date and time functionality is relatively limited at present compared with pandas, but we'll get there. The main things we have right now are\n",
438 | "\n",
439 | "- Field access (year, month, day, ...)\n",
440 | "- Timedeltas\n",
441 | "- Comparisons with fixed timestamps"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "metadata": {},
448 | "outputs": [],
449 | "source": [
450 | "independence = connection.table(\"independence\")\n",
451 | "\n",
452 | "independence[\n",
453 | " independence.independence_date,\n",
454 | " independence.independence_date.month().name(\"month\"),\n",
455 | "].limit(10)"
456 | ]
457 | },
458 | {
459 | "attachments": {},
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "Somewhat more comprehensively"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "metadata": {},
470 | "outputs": [],
471 | "source": [
472 | "def get_field(f):\n",
473 | " return getattr(independence.independence_date, f)().name(f)\n",
474 | "\n",
475 | "\n",
476 | "fields = [\n",
477 | " \"year\",\n",
478 | " \"month\",\n",
479 | " \"day\",\n",
480 | "] # datetime fields can also use: 'hour', 'minute', 'second', 'millisecond'\n",
481 | "projection = [independence.independence_date] + [get_field(x) for x in fields]\n",
482 | "independence[projection].limit(10)"
483 | ]
484 | },
485 | {
486 | "attachments": {},
487 | "cell_type": "markdown",
488 | "metadata": {},
489 | "source": [
490 | "For timestamp arithmetic and comparisons, check out functions in the top level `ibis` namespace. This include things like `day` and `second`, but also the `ibis.timestamp` function:"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": null,
496 | "metadata": {},
497 | "outputs": [],
498 | "source": [
499 | "independence.independence_date.max()"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": null,
505 | "metadata": {},
506 | "outputs": [],
507 | "source": [
508 | "independence[independence.independence_date > \"2000-01-01\"].count()"
509 | ]
510 | },
511 | {
512 | "attachments": {},
513 | "cell_type": "markdown",
514 | "metadata": {},
515 | "source": [
516 | "Some backends support adding offsets. For example:\n",
517 | "\n",
518 | "```python\n",
519 | "independence.independence_date + ibis.interval(days=1)\n",
520 | "ibis.now() - independence.independence_date\n",
521 | "```"
522 | ]
523 | }
524 | ],
525 | "metadata": {
526 | "kernelspec": {
527 | "display_name": "Python 3 (ipykernel)",
528 | "language": "python",
529 | "name": "python3"
530 | },
531 | "language_info": {
532 | "codemirror_mode": {
533 | "name": "ipython",
534 | "version": 3
535 | },
536 | "file_extension": ".py",
537 | "mimetype": "text/x-python",
538 | "name": "python",
539 | "nbconvert_exporter": "python",
540 | "pygments_lexer": "ipython3",
541 | "version": "3.10.12"
542 | }
543 | },
544 | "nbformat": 4,
545 | "nbformat_minor": 4
546 | }
547 |
--------------------------------------------------------------------------------
/tutorial/05-IO-Create-Insert-External-Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Creating and Inserting Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Setup"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from tutorial_utils import setup\n",
24 | "\n",
25 | "setup()"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import ibis\n",
35 | "\n",
36 | "ibis.options.interactive = True\n",
37 | "\n",
38 | "connection = ibis.sqlite.connect(\"geography.db\")"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Creating new tables from Ibis expressions\n",
46 | "\n",
47 | "\n",
48 | "Suppose you have an Ibis expression that produces a table:"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "countries = connection.table(\"countries\")\n",
58 | "\n",
59 | "continent_name = (\n",
60 | " countries.continent.case()\n",
61 | " .when(\"AF\", \"Africa\")\n",
62 | " .when(\"AN\", \"Antarctica\")\n",
63 | " .when(\"AS\", \"Asia\")\n",
64 | " .when(\"EU\", \"Europe\")\n",
65 | " .when(\"NA\", \"North America\")\n",
66 | " .when(\"OC\", \"Oceania\")\n",
67 | " .when(\"SA\", \"South America\")\n",
68 | " .else_(countries.continent)\n",
69 | " .end()\n",
70 | " .name(\"continent_name\")\n",
71 | ")\n",
72 | "\n",
73 | "expr = countries[countries.continent, continent_name].distinct()\n",
74 | "expr"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "To create a table in the database from the results of this expression, use the connection's `create_table` method:"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "connection.create_table(\"continents\", expr)"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "continents = connection.table(\"continents\")\n",
100 | "continents"
101 | ]
102 | },
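   103 | {
   104 | "cell_type": "markdown",
   105 | "metadata": {},
   106 | "source": [
   107 | "Many SQL backends also expose an `insert` method for appending the rows of an expression (or a dataframe) to an existing table. A minimal sketch, assuming the backend supports it:\n",
   108 | "\n",
   109 | "```python\n",
   110 | "connection.insert(\"continents\", expr)\n",
   111 | "```"
   112 | ]
   113 | },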
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "Tables can be similarly dropped with `drop_table`"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "connection.drop_table(\"continents\")"
117 | ]
118 | }
119 | ],
120 | "metadata": {
121 | "kernelspec": {
122 | "display_name": "Python 3 (ipykernel)",
123 | "language": "python",
124 | "name": "python3"
125 | },
126 | "language_info": {
127 | "codemirror_mode": {
128 | "name": "ipython",
129 | "version": 3
130 | },
131 | "file_extension": ".py",
132 | "mimetype": "text/x-python",
133 | "name": "python",
134 | "nbconvert_exporter": "python",
135 | "pygments_lexer": "ipython3",
136 | "version": "3.10.8"
137 | },
138 | "vscode": {
139 | "interpreter": {
140 | "hash": "c91744e846ab1fb46a81a92b1fa828c0e6b1381e7e12fd7b2bb300d813000458"
141 | }
142 | }
143 | },
144 | "nbformat": 4,
145 | "nbformat_minor": 4
146 | }
147 |
--------------------------------------------------------------------------------
/tutorial/06-ComplexFiltering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Complex Filtering\n",
8 | "\n",
9 | "The filtering examples we've shown to this point have been pretty simple, either comparisons between columns or fixed values, or set filter functions like `isin` and `notin`. \n",
10 | "\n",
11 | "Ibis supports a number of richer analytical filters that can involve one or more of:\n",
12 | "\n",
13 | "- Aggregates computed from the same or other tables\n",
14 | "- Conditional aggregates (in SQL-speak these are similar to \"correlated subqueries\")\n",
15 | "- \"Existence\" set filters (equivalent to the SQL `EXISTS` and `NOT EXISTS` keywords)"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Setup"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "from tutorial_utils import setup\n",
32 | "\n",
33 | "setup()"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import ibis\n",
43 | "\n",
44 | "ibis.options.interactive = True\n",
45 | "\n",
46 | "connection = ibis.sqlite.connect(\"geography.db\")"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "## Using scalar aggregates in filters"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "countries = connection.table(\"countries\")\n",
63 | "countries.limit(5)"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "We could always compute some aggregate value from the table and use that in another expression, or we can use a data-derived aggregate in the filter. Take the average of a column. For example the average of countries size:"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "countries.area_km2.mean()"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "You can use this expression as a substitute for a scalar value in a filter, and the execution engine will combine everything into a single query rather than having to access the database multiple times. For example, we want to filter European countries larger than the average country size in the world. See how most countries in Europe are smaller than the world average:"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "cond = countries.area_km2 > countries.area_km2.mean()\n",
96 | "expr = countries[(countries.continent == \"EU\") & cond]\n",
97 | "expr"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "## Conditional aggregates\n",
105 | "\n",
106 | "\n",
107 | "Suppose that we wish to filter using an aggregate computed conditional on some other expressions holding true.\n",
108 | "\n",
109 | "For example, we want to filter European countries larger than the average country size, but this time of the average in Africa. African countries have an smaller size compared to the world average, and France gets into the list:"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "conditional_avg = countries[countries.continent == \"AF\"].area_km2.mean()\n",
119 | "countries[(countries.continent == \"EU\") & (countries.area_km2 > conditional_avg)]"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "## \"Existence\" filters\n",
127 | "\n",
128 | "\n",
129 | "Some filtering involves checking for the existence of a particular value in a column of another table, or amount the results of some value expression. This is common in many-to-many relationships, and can be performed in numerous different ways, but it's nice to be able to express it with a single concise statement and let Ibis compute it optimally.\n",
130 | "\n",
131 | "An example could be finding all countries that had **any** year with a higher GDP than 3 trillion US dollars:"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "gdp = connection.table(\"gdp\")\n",
141 | "gdp"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "cond = ((gdp.country_code == countries.iso_alpha3) & (gdp.value > 3e12)).any()\n",
151 | "\n",
152 | "countries[cond][\"name\"]"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "Note how this is different than a join between `countries` and `gdp`, which would return one row per year. The method `.any()` is equivalent to filtering with a subquery."
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "## Filtering in aggregations\n",
167 | "\n",
168 | "\n",
169 | "Suppose that you want to compute an aggregation with a subset of the data for _only one_ of the metrics / aggregates in question, and the complete data set with the other aggregates. Most aggregation functions are thus equipped with a `where` argument. Let me show it to you in action:"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "arctic = countries.name.isin(\n",
179 | " [\n",
180 | " \"United States\",\n",
181 | " \"Canada\",\n",
182 | " \"Finland\",\n",
183 | " \"Greenland\",\n",
184 | " \"Iceland\",\n",
185 | " \"Norway\",\n",
186 | " \"Russia\",\n",
187 | " \"Sweden\",\n",
188 | " ]\n",
189 | ")\n",
190 | "\n",
191 | "metrics = [\n",
192 | " countries.count().name(\"# countries\"),\n",
193 | " countries.population.sum().name(\"total population\"),\n",
194 | " countries.population.sum(where=arctic).name(\"population arctic countries\"),\n",
195 | "]\n",
196 | "\n",
197 | "(countries.group_by(countries.continent).aggregate(metrics))"
198 | ]
199 | }
200 | ],
201 | "metadata": {
202 | "kernelspec": {
203 | "display_name": "Python 3 (ipykernel)",
204 | "language": "python",
205 | "name": "python3"
206 | },
207 | "language_info": {
208 | "codemirror_mode": {
209 | "name": "ipython",
210 | "version": 3
211 | },
212 | "file_extension": ".py",
213 | "mimetype": "text/x-python",
214 | "name": "python",
215 | "nbconvert_exporter": "python",
216 | "pygments_lexer": "ipython3",
217 | "version": "3.10.8"
218 | }
219 | },
220 | "nbformat": 4,
221 | "nbformat_minor": 4
222 | }
223 |
--------------------------------------------------------------------------------
/tutorial/07-Analytics-Tools.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Analytics Tools"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Setup"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from tutorial_utils import setup\n",
24 | "\n",
25 | "setup()"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import ibis\n",
35 | "\n",
36 | "ibis.options.interactive = True\n",
37 | "\n",
38 | "connection = ibis.sqlite.connect(\"geography.db\")"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Frequency tables\n",
46 | "\n",
47 | "Ibis provides the `value_counts` API, just like pandas, for computing a frequency table for a table column or array expression. You might have seen it used already earlier in the tutorial. "
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "countries = connection.table(\"countries\")\n",
57 | "countries.continent.value_counts()"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "This can be customized, of course:"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "freq = countries.group_by(countries.continent).aggregate(\n",
74 | " [\n",
75 | " countries.count().name(\"# countries\"),\n",
76 | " countries.population.sum().name(\"total population\"),\n",
77 | " ]\n",
78 | ")\n",
79 | "freq"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "## Binning and histograms\n",
87 | "\n",
88 | "\n",
89 | "Numeric array expressions (columns with numeric type and other array expressions) have `bucket` and `histogram` methods which produce different kinds of binning. These produce category values (the computed bins) that can be used in grouping and other analytics.\n",
90 | "\n",
91 | "Some backends implement the `.summary()` method, which can be used to see the general distribution of a column.\n",
92 | "\n",
93 | "Let's have a look at a few examples."
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "Alright then, now suppose we want to split the countries up into some buckets of our choosing for their population:"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "buckets = [0, 1e6, 1e7, 1e8, 1e9]"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "The `bucket` function creates a bucketed category from the prices:"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "bucketed = countries.population.bucket(buckets).name(\"bucket\")"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "Let's have a look at the value counts:"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "bucketed.value_counts()"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "The buckets we wrote down define 4 buckets numbered 0 through 3. The `NaN` is a pandas `NULL` value (since that's how pandas represents nulls in numeric arrays), so don't worry too much about that. Since the bucketing ends at 100000, we see there are 4122 values that are over 100000. These can be included in the bucketing with `include_over`:"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "bucketed = countries.population.bucket(buckets, include_over=True).name(\"bucket\")\n",
158 | "bucketed.value_counts()"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "The `bucketed` object here is a special **_category_** type"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "bucketed.type()"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "Category values can either have a known or unknown **_cardinality_**. In this case, there's either 4 or 5 buckets based on how we used the `bucket` function.\n",
182 | "\n",
183 | "Labels can be assigned to the buckets at any time using the `label` function:"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "bucket_counts = bucketed.value_counts()\n",
193 | "\n",
194 | "labeled_bucket = bucket_counts.bucket.label(\n",
195 | " [\"< 1M\", \"> 1M\", \"> 10M\", \"> 100M\", \"> 1B\"]\n",
196 | ").name(\"bucket_name\")\n",
197 | "\n",
198 | "expr = bucket_counts[labeled_bucket, bucket_counts].order_by(\"bucket\")\n",
199 | "expr"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "Nice, huh?\n",
207 | "\n",
208 | "Some backends implement `histogram(num_bins)`, a linear (fixed size bin) equivalent."
209 | ]
210 | }
211 | ],
212 | "metadata": {
213 | "kernelspec": {
214 | "display_name": "Python 3 (ipykernel)",
215 | "language": "python",
216 | "name": "python3"
217 | },
218 | "language_info": {
219 | "codemirror_mode": {
220 | "name": "ipython",
221 | "version": 3
222 | },
223 | "file_extension": ".py",
224 | "mimetype": "text/x-python",
225 | "name": "python",
226 | "nbconvert_exporter": "python",
227 | "pygments_lexer": "ipython3",
228 | "version": "3.10.8"
229 | }
230 | },
231 | "nbformat": 4,
232 | "nbformat_minor": 4
233 | }
234 |
--------------------------------------------------------------------------------
/tutorial/tutorial_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from urllib.request import urlretrieve
3 |
4 |
5 | def setup():
6 | """Download the tutorial database if it doesn't already exist"""
7 | if not os.path.exists("geography.db"):
8 | urlretrieve(
9 | "https://storage.googleapis.com/ibis-tutorial-data/geography.db",
10 | "geography.db",
11 | )
12 |
--------------------------------------------------------------------------------
/welcome.md:
--------------------------------------------------------------------------------
1 | # Ibis Examples
2 |
3 | Welcome! This is a live Python environment running at
4 | [mybinder.org](https://mybinder.org/).
5 |
6 | It contains many example notebooks illustrating how to use
7 | [Ibis](https://ibis-project.org/).
8 |
9 | A file browser listing the example notebooks can be found on the left. If
10 | you're new to Ibis, we recommend starting with the tutorial found in the
11 | `tutorial` directory.
12 |
13 | ## Additional Resources
14 |
15 | To learn more about Ibis, please visit
16 | [ibis-project.org](https://ibis-project.org/).
17 |
18 | The code for these examples can be found [here on
19 | GitHub](https://github.com/ibis-project/ibis-examples). Found a typo, a bug, or
20 | a confusing example? Please [open an
21 | issue](https://github.com/ibis-project/ibis-examples/issues)!
22 |
--------------------------------------------------------------------------------