├── .devcontainer
│   └── devcontainer.json
├── .github
│   └── workflows
│       └── publish.yml
├── .gitignore
├── .jupyter
│   └── jupyter_notebook_config.py
├── .readthedocs.yaml
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── _config.yml
├── _toc.yml
├── data
│   ├── confirmed.csv
│   ├── death.csv
│   ├── stock_sentiment_data.csv
│   └── titanic-preprocessed
│       └── titanic.parquet
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── fugue_logo_trimmed.svg
│   │   ├── logo_blue.svg
│   │   └── logo_doc.svg
│   ├── _templates
│   │   ├── package.rst_t
│   │   └── toc.rst_t
│   └── make.bat
├── experiments
│   └── Polars_vs_Koalas.ipynb
├── images
│   ├── architecture.svg
│   ├── autoarima.png
│   ├── checkpoint.svg
│   ├── databricks_koalas_iloc.png
│   ├── extensions.svg
│   ├── fugue_backends.png
│   ├── fugue_spark_benchmark_cdf.png
│   ├── fugue_spark_benchmark_plus_one.png
│   ├── fugue_spark_benchmark_subtract_mean_1.png
│   ├── fugue_spark_benchmark_subtract_mean_2.png
│   ├── fugue_sql
│   │   └── ipyvizzu.png
│   ├── fugue_vs_spark.png
│   ├── logo_blue.svg
│   ├── nodes.png
│   ├── nodes.svg
│   ├── p_by_color_fugue.svg
│   ├── p_by_color_presort.svg
│   ├── p_by_color_spark.svg
│   ├── p_even_by_color.svg
│   ├── p_even_num_4.svg
│   ├── p_even_num_4_by_color.svg
│   ├── p_num_4.svg
│   ├── p_orig.svg
│   ├── p_rand_by_color.svg
│   ├── p_rand_num_4.svg
│   ├── pandas_like_1.png
│   ├── pandas_like_2.png
│   ├── pandas_like_3.png
│   ├── prefect_fugue_block.png
│   ├── transformers.svg
│   ├── type_hint_functionality.svg
│   └── warehouses
│       └── create_bigquery_dataset.png
├── index.md
├── requirements.txt
├── spark-defaults.conf
└── tutorials
    ├── advanced
    │   ├── checkpoint.ipynb
    │   ├── execution_engine.ipynb
    │   ├── index.md
    │   ├── partition.ipynb
    │   ├── rpc.ipynb
    │   ├── schema_dataframes.ipynb
    │   ├── useful_config.ipynb
    │   ├── validation.ipynb
    │   └── x-like.ipynb
    ├── applications
    │   ├── debugging
    │   │   ├── index.md
    │   │   └── unknown_opcode.ipynb
    │   ├── examples
    │   │   ├── example_covid19.ipynb
    │   │   ├── index.md
    │   │   └── stock_sentiment.ipynb
    │   ├── recipes
    │   │   ├── index.md
    │   │   ├── loading_databases.ipynb
    │   │   ├── loading_text_files.ipynb
    │   │   └── pivot.ipynb
    │   └── use_cases
    │       ├── databricks_connect.ipynb
    │       ├── image_classification.ipynb
    │       ├── index.md
    │       ├── model_sweeping.ipynb
    │       ├── nlp.ipynb
    │       └── unit_testing.ipynb
    ├── beginner
    │   ├── beginner_sql.ipynb
    │   ├── distributed_compute.ipynb
    │   ├── engine_context.ipynb
    │   ├── execution_engine.ipynb
    │   ├── index.md
    │   ├── io.ipynb
    │   ├── joins.ipynb
    │   ├── partitioning.ipynb
    │   ├── schema.ipynb
    │   ├── transform.ipynb
    │   ├── transformations.ipynb
    │   └── type_hinting.ipynb
    ├── extensions
    │   ├── cotransformer.ipynb
    │   ├── creator.ipynb
    │   ├── index.md
    │   ├── interfaceless.ipynb
    │   ├── outputcotransformer.ipynb
    │   ├── outputter.ipynb
    │   ├── outputtransformer.ipynb
    │   ├── processor.ipynb
    │   └── transformer.ipynb
    ├── fugue_sql
    │   ├── builtin.ipynb
    │   ├── extensions.ipynb
    │   ├── index.md
    │   ├── operators.ipynb
    │   ├── python.ipynb
    │   └── syntax.ipynb
    ├── integrations
    │   ├── backends
    │   │   ├── dask_sql.ipynb
    │   │   ├── duckdb.ipynb
    │   │   ├── ibis.ipynb
    │   │   ├── index.md
    │   │   └── polars.ipynb
    │   ├── cloudproviders
    │   │   ├── anyscale.ipynb
    │   │   ├── coiled.ipynb
    │   │   ├── databricks.ipynb
    │   │   ├── images
    │   │   │   ├── anyscale_address.png
    │   │   │   ├── anyscale_auth.png
    │   │   │   ├── anyscale_env.png
    │   │   │   └── anyscale_jupyter.png
    │   │   └── index.md
    │   ├── ecosystem
    │   │   ├── datacompy.ipynb
    │   │   ├── index.md
    │   │   ├── nixtla.ipynb
    │   │   ├── pandera.ipynb
    │   │   ├── prefect.ipynb
    │   │   ├── pycaret.ipynb
    │   │   └── whylogs.ipynb
    │   └── warehouses
    │       ├── bigquery.ipynb
    │       ├── index.md
    │       └── trino.ipynb
    ├── quick_look
    │   ├── index.md
    │   ├── ten_minutes.ipynb
    │   └── ten_minutes_sql.ipynb
    ├── resources
    │   ├── appendix
    │   │   ├── generate_types.ipynb
    │   │   └── index.md
    │   ├── best_practices
    │   │   ├── explicit_schema.ipynb
    │   │   ├── file_formats.ipynb
    │   │   ├── fugue_not_pandas.ipynb
    │   │   ├── fugue_spark_benchmark.ipynb
    │   │   ├── fugue_spark_benchmark_notebook.html
    │   │   └── index.md
    │   ├── content.md
    │   └── major_changes.md
    └── tune
        ├── SciPyDemo_3_GreyKite.ipynb
        ├── index.md
        ├── iterative.ipynb
        ├── non_iterative.ipynb
        └── search_space.ipynb
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "Fugue Development Environment",
3 |     "image": "fugueproject/devenv:latest",
4 |     "settings": {
5 |         "terminal.integrated.shell.linux": "/bin/bash",
6 |         "python.pythonPath": "/usr/local/bin/python",
7 |         "python.linting.enabled": true,
8 |         "python.linting.pylintEnabled": true,
9 |         "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
10 |         "python.formatting.blackPath": "/usr/local/py-utils/bin/black",
11 |         "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
12 |         "python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
13 |         "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8",
14 |         "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
15 |         "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
16 |         "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
17 |         "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint"
18 |     },
19 |     "extensions": [
20 |         "ms-python.python"
21 |     ],
22 |     "forwardPorts": [
23 |         8888
24 |     ],
25 |     "postCreateCommand": [
26 |     ],
27 |     "mounts": [
28 |         "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind"
29 |     ]
30 | }
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Publish
5 |
6 | on:
7 |   release:
8 |     types: [created]
9 |
10 | jobs:
11 |   push_to_registry:
12 |     name: Push Docker image to Docker Hub
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Check out the repo
16 |         uses: actions/checkout@v2
17 |       - name: Push to Docker Hub
18 |         uses: docker/build-push-action@v2
19 |         with:
20 |           username: ${{ secrets.DOCKER_USERNAME }}
21 |           password: ${{ secrets.DOCKER_PASSWORD }}
22 |           repository: fugueproject/tutorials
23 |           tags: latest
24 |           tag_with_ref: true
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 | docs/images/
74 | docs/tutorials/
75 | docs/jupyter_execute/
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 | pythonenv3.8
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 |
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
135 | .vscode
136 | tmp
137 |
138 | # Antlr
139 | .antlr
140 |
141 | # dask
142 | dask-worker-space
143 |
144 | # spark
145 | spark-warehouse
146 | =*
147 |
148 | .DS_Store
149 |
150 | # jupyter-book
151 | _build/
152 |
153 | # pytorch models
154 | *.pth
--------------------------------------------------------------------------------
/.jupyter/jupyter_notebook_config.py:
--------------------------------------------------------------------------------
1 | c = get_config()
2 |
3 | c.NotebookApp.token=''
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | # Set the version of Python and other tools you might need
4 | build:
5 |   os: ubuntu-20.04
6 |   tools:
7 |     python: "3.8"
8 |   jobs:
9 |     pre_install:
10 |       - pip install -U pip
11 |     pre_build:
12 |       # Generate the Sphinx configuration for this Jupyter Book so it builds.
13 |       - "jupyter-book config sphinx docs/"
14 |
15 | # Build documentation in the docs/ directory with Sphinx
16 | sphinx:
17 |   configuration: conf.py
18 |
19 | python:
20 |   install:
21 |     - requirements: requirements.txt
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM fugueproject/notebook:0.5.4
2 |
3 | ENV NB_USER vscode
4 | ENV NB_UID 1000
5 | ENV USER ${NB_USER}
6 | ENV NB_UID ${NB_UID}
7 | ENV HOME /home/${NB_USER}
8 |
9 | WORKDIR ${HOME}
10 |
11 | USER root
12 |
13 | COPY README.md ${HOME}/
14 | COPY tutorials ${HOME}/tutorials
15 | COPY data ${HOME}/data
16 | COPY .jupyter ${HOME}/.jupyter
17 | COPY images ${HOME}/images
18 | RUN rm ${SPARK_CONF_DIR}/spark-defaults.conf
19 | COPY spark-defaults.conf ${SPARK_CONF_DIR}/
20 |
21 | USER root
22 | RUN chown -R ${NB_UID} ${HOME}
23 | USER ${NB_USER}
24 |
25 | WORKDIR ${HOME}
26 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: help clean dev docs package test
2 |
3 | help:
4 | 	@echo "The following make targets are available:"
5 | 	@echo " rundev docker build and start the jupyter notebook mounting the current directory for development"
6 |
7 | rundev:
8 | 	docker build -t fuguetutorial:latest .
9 | 	docker run -p 8888:8888 -v $(PWD):/home/vscode/work fuguetutorial:latest jupyter notebook --port=8888 --ip=0.0.0.0 --no-browser --allow-root
10 |
11 | dev:
12 | 	pip3 install -r requirements.txt
13 |
14 | olddocs:
15 | 	rm -rf docs/tutorials
16 | 	rm -rf docs/images
17 | 	rm -rf docs/build
18 | 	rm docs/README.ipynb
19 | 	cp README.ipynb docs/
20 | 	cp -r tutorials/ docs/tutorials
21 | 	cp -r images/ docs/images
22 | 	rm -rf docs/tutorials/.ipynb_checkpoints
23 | 	rm -rf docs/tutorials/dask-worker-space
24 | 	rm -rf docs/tutorials/spark-warehouse
25 | 	python -m sphinx docs/ docs/build
26 |
27 | docs:
28 | 	rm -rf docs/build
29 | 	rm -rf tutorials/.ipynb_checkpoints
30 | 	rm -rf tutorials/dask-worker-space
31 | 	rm -rf tutorials/spark-warehouse
32 | 	python -m sphinx ./ docs/build
33 |
34 | jdocs:
35 | 	jupyter-book build .
36 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | #
2 |
3 | [](https://pypi.python.org/pypi/fugue/)
4 | [](https://pypi.python.org/pypi/fugue/)
5 | [](https://pypi.python.org/pypi/fugue/)
6 | [](https://codecov.io/gh/fugue-project/fugue)
7 | [](https://www.codacy.com/gh/fugue-project/fugue/dashboard?utm_source=github.com&utm_medium=referral&utm_content=fugue-project/fugue&utm_campaign=Badge_Grade)
8 | [](https://pepy.tech/project/fugue)
9 |
10 | | Tutorials | API Documentation | Chat with us on slack! |
11 | | --- | --- | --- |
12 | | [](https://fugue-tutorials.readthedocs.io/) | [](https://fugue.readthedocs.io/en/latest/) | [](http://slack.fugue.ai) |
13 |
14 | **Fugue is a unified interface for distributed computing that lets users execute Python, pandas, and SQL code on Spark, Dask, and Ray with minimal rewrites**.
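
To give a flavor of the API (a minimal sketch added for illustration, not an excerpt from the project; the Spark call assumes `fugue[spark]` and Java are installed), the `transform()` function takes plain pandas logic and runs it on the backend you choose:

```python
import pandas as pd
from fugue import transform

# plain pandas logic; the schema hint tells Fugue the output schema
# schema: *,c:long
def add_col(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(c=df["a"] + df["b"])

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

local_res = transform(df, add_col)                  # runs on pandas
spark_res = transform(df, add_col, engine="spark")  # same code, distributed on Spark
```

Only the `engine` argument changes when moving from local development to a cluster.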
15 |
16 | ## [Tutorials](https://fugue-tutorials.readthedocs.io/)
17 |
18 | The best way to get started with Fugue is to work through the 10 minute tutorials:
19 |
20 | * [Fugue in 10 minutes](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes.html)
21 | * [FugueSQL in 10 minutes](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes_sql.html)
22 |
23 | ## Running Tutorials Interactively
24 |
25 | ### Using Binder
26 |
27 | [](https://mybinder.org/v2/gh/fugue-project/tutorials/master)
28 |
29 | **Note that it runs slowly on Binder** because the machine on Binder isn't powerful enough for a distributed framework such as Spark. Parallel execution can become sequential, so some of the performance comparison examples will not give you accurate numbers.
30 |
31 | ### Using Docker
32 |
33 | Alternatively, you should get decent performance by running this Docker image on your own machine:
34 |
35 | ```bash
36 | docker run -p 8888:8888 fugueproject/tutorials:latest
37 | ```
38 |
39 | ## Community
40 |
41 | Feel free to message us on [Slack](http://slack.fugue.ai)
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | title: Fugue Tutorials
2 | author: The Fugue Development Team
3 | logo: images/logo_blue.svg
4 | execute:
5 |   timeout: 600
6 |   execute_notebooks: off
7 |   allow_errors: false
8 |
9 | launch_buttons:
10 |   notebook_interface : classic # The interface interactive links will activate ["classic", "jupyterlab"]
11 |   binderhub_url : https://mybinder.org # The URL of the BinderHub (e.g., https://mybinder.org)
12 |   thebe : true
13 |
14 | repository:
15 |   url : https://github.com/fugue-project/tutorials/ # The URL to your book's repository
16 |   path_to_book : "" # A path to your book's folder, relative to the repository root.
17 |   branch : master # Which branch of the repository should be used when creating links
18 |
19 | html:
20 |   favicon: docs/_static/fugue_logo_trimmed.svg
21 |   use_edit_page_button: true
22 |   use_repository_button: true
23 |   use_issues_button: true
24 |
25 | parse:
26 |   myst_enable_extensions:
27 |     - html_image
28 |
--------------------------------------------------------------------------------
/_toc.yml:
--------------------------------------------------------------------------------
1 | root: index
--------------------------------------------------------------------------------
/data/titanic-preprocessed/titanic.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/data/titanic-preprocessed/titanic.parquet
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/_static/fugue_logo_trimmed.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/_static/logo_blue.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
23 |
--------------------------------------------------------------------------------
/docs/_static/logo_doc.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
23 |
--------------------------------------------------------------------------------
/docs/_templates/package.rst_t:
--------------------------------------------------------------------------------
1 | {%- macro automodule(modname, options) -%}
2 | .. automodule:: {{ modname }}
3 | {%- for option in options %}
4 | :{{ option }}:
5 | {%- endfor %}
6 | {%- endmacro %}
7 |
8 | {%- macro toctree(docnames) -%}
9 | .. toctree::
10 | :maxdepth: {{ maxdepth }}
11 | {% for docname in docnames %}
12 | {{ docname }}
13 | {%- endfor %}
14 | {%- endmacro %}
15 |
16 | {%- if is_namespace %}
17 | {{- [pkgname, ""] | join(" ") | e | heading }}
18 | {% else %}
19 | {{- [pkgname, ""] | join(" ") | e | heading }}
20 | {% endif %}
21 |
22 | {%- if modulefirst and not is_namespace %}
23 | {{ automodule(pkgname, automodule_options) }}
24 | {% endif %}
25 |
26 | {%- if subpackages %}
27 | {{ toctree(subpackages) }}
28 | {% endif %}
29 |
30 | {%- if submodules %}
31 | {% if separatemodules %}
32 |
33 | {%- else %}
34 | {%- for submodule in submodules %}
35 | {% if show_headings %}
36 | {{- submodule | e | heading(2) }}
37 | {% endif %}
38 | {{ automodule(submodule, automodule_options) }}
39 | {% endfor %}
40 | {%- endif %}
41 | {% endif %}
42 |
--------------------------------------------------------------------------------
/docs/_templates/toc.rst_t:
--------------------------------------------------------------------------------
1 | {{ header | heading }}
2 |
3 | .. toctree::
4 | :maxdepth: {{ maxdepth }}
5 | {% for docname in docnames %}
6 | {{ docname }}
7 | {%- endfor %}
8 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/images/autoarima.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/autoarima.png
--------------------------------------------------------------------------------
/images/databricks_koalas_iloc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/databricks_koalas_iloc.png
--------------------------------------------------------------------------------
/images/fugue_backends.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/fugue_backends.png
--------------------------------------------------------------------------------
/images/fugue_spark_benchmark_cdf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/fugue_spark_benchmark_cdf.png
--------------------------------------------------------------------------------
/images/fugue_spark_benchmark_plus_one.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/fugue_spark_benchmark_plus_one.png
--------------------------------------------------------------------------------
/images/fugue_spark_benchmark_subtract_mean_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/fugue_spark_benchmark_subtract_mean_1.png
--------------------------------------------------------------------------------
/images/fugue_spark_benchmark_subtract_mean_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/fugue_spark_benchmark_subtract_mean_2.png
--------------------------------------------------------------------------------
/images/fugue_sql/ipyvizzu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/fugue_sql/ipyvizzu.png
--------------------------------------------------------------------------------
/images/fugue_vs_spark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/fugue_vs_spark.png
--------------------------------------------------------------------------------
/images/logo_blue.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
23 |
--------------------------------------------------------------------------------
/images/nodes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/nodes.png
--------------------------------------------------------------------------------
/images/nodes.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/images/p_orig.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/images/pandas_like_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/pandas_like_1.png
--------------------------------------------------------------------------------
/images/pandas_like_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/pandas_like_2.png
--------------------------------------------------------------------------------
/images/pandas_like_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/pandas_like_3.png
--------------------------------------------------------------------------------
/images/prefect_fugue_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/prefect_fugue_block.png
--------------------------------------------------------------------------------
/images/warehouses/create_bigquery_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/images/warehouses/create_bigquery_dataset.png
--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | hide-toc: true
3 | ---
4 |
5 | # Welcome to the Fugue Tutorials!
6 |
7 | Have questions? Chat with us on Github or Slack:
8 |
9 | [](https://github.com/fugue-project/fugue)
10 | [](http://slack.fugue.ai)
11 |
12 |
13 | [Fugue](https://github.com/fugue-project/fugue) provides an easier interface for using distributed compute effectively and accelerates big data projects. It does this by minimizing the amount of code you need to write, while taking care of tricks and optimizations that lead to more efficient execution on distributed compute. Fugue ports Python, Pandas, and SQL code to Spark, Dask, and Ray.
14 |
15 | 
16 |
17 | Quick Links:
18 |
19 | * Scaling Pandas code to Spark, Dask, or Ray? Start with [Fugue in 10 minutes](tutorials/quick_look/ten_minutes.ipynb).
20 | * Need a SQL interface on top of Pandas, Spark and Dask? Check [FugueSQL in 10 minutes](tutorials/quick_look/ten_minutes_sql.ipynb).
21 | * For previous conference presentations and blog posts, check the [Content page](tutorials/resources/content.md).
22 |
23 | ## How Does Fugue Compare to Other Tools?
24 |
25 | ### Spark, Dask, Ray
26 |
27 | Fugue simplifies the usage of these backends. It doesn't reinvent the wheel; Fugue will always push down to these underlying engines. The goal of Fugue is to minimize the amount of framework-specific code users need to learn to leverage these engines. Fugue also serves as a bridge between local testing and large-scale execution.
28 |
29 | ### PySpark Pandas, Modin
30 |
31 | Like Fugue, PySpark Pandas and Modin aim to simplify the experience of working with big data frameworks. The difference is that Fugue does not aim to be a drop-in replacement for Pandas, because that is a [sub-optimal interface](https://towardsdatascience.com/why-pandas-like-interfaces-are-sub-optimal-for-distributed-computing-322dacbce43) for distributed computing. There are many operations (especially around the index) that don't translate well to a distributed setting. Fugue advocates a separation of tools and using each tool for its strengths. Pandas code is supported and encouraged for describing business logic, but Fugue will use Spark, Dask, or Ray to distribute these Pandas jobs.
32 |
33 | ### dbt
34 |
35 | dbt is a programming interface that pushes down the code to backends (Snowflake, Spark). Fugue also has FugueSQL, which is a SQL-like interface for pushing down to backends (DuckDB, Spark, Dask). FugueSQL is also extending to the data warehouse side with integrations like BigQuery and Trino. The main difference between dbt and Fugue is that Fugue does not confine users to SQL. It also supports Python (and encourages a mix of SQL and Python). dbt supports Python, but it's not a first-class citizen and not scalable.
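
As a rough sketch of what mixing SQL and Python looks like (illustrative only; `fugue_sql_flow` appears in the advanced tutorials of this repo, and `double` is a made-up function):

```python
import pandas as pd
from fugue.api import fugue_sql_flow

# a hypothetical transformer written in pandas, invoked from FugueSQL below
# schema: *,doubled:long
def double(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(doubled=df["a"] * 2)

fugue_sql_flow("""
df = CREATE [[1],[2],[3]] SCHEMA a:long
TRANSFORM df USING double
PRINT
""").run()  # pass an engine such as "duckdb" or "spark" to .run() to change the backend
```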
36 |
37 | ### DuckDB
38 |
39 | DuckDB is a backend for Fugue, allowing users to prototype code in a local setting and then scale out by switching the backend. For more information, see the [DuckDB documentation](https://duckdb.org/docs/guides/python/fugue).
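
For example (a minimal sketch added here, assuming the `duckdb` and `spark` extras are installed; not taken from the DuckDB docs), the same FugueSQL query can be prototyped locally on DuckDB and later run on Spark by changing only the engine:

```python
import pandas as pd
import fugue.api as fa

df = pd.DataFrame({"color": ["red", "blue", "red"], "value": [1, 2, 3]})

query = """
SELECT color, SUM(value) AS total
FROM df
GROUP BY color
"""

local_result = fa.fugue_sql(query, df=df, engine="duckdb")   # prototype locally
# at_scale = fa.fugue_sql(query, df=df, engine="spark")      # scale out later
```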
40 |
41 | ### Ibis
42 |
43 | Fugue has an Ibis integration that is mainly about accessing data already sitting in various data stores. For example, it is also used under the hood for the [BigQuery integration](https://fugue-tutorials.readthedocs.io/tutorials/integrations/warehouses/bigquery.html).
44 |
45 | The differences:
46 |
47 | 1. Fugue guarantees consistency between backends. NULL handling can be different depending on the backend. For example, Pandas joins NULL with NULL while Spark doesn't, so if users prototype locally on Pandas and then scale to Spark, Fugue guarantees the same results. Fugue is 100% unit tested and the backends go through the same test suite.
48 |
49 | 2. Ibis is Pythonic for SQL backends. We embrace SQL, but understand its limitations. FugueSQL is an enhanced SQL dialect that can invoke Python code. FugueSQL can be the first-class grammar instead of being sandwiched by Python code. Fugue's Python API and SQL API are 1:1 in capability.
50 |
51 | 3. Fugue doesn't want users to learn any new language. Ibis is a new way to express things; we just want to extend the capabilities of what people already know (SQL, native Python, and Pandas). Fugue can also be incrementally adopted, meaning it can be used for just one portion of your workflow.
52 |
53 | 4. Roadmap-wise, Fugue thinks the optimal solutions will be a mix of different tools. A clear example is pre-aggregating data with DuckDB and then using Pandas for further processing. Similarly, can we preprocess in Snowflake and do machine learning in Spark? Fugue is working on connecting these different systems to enable cross-platform workloads.
54 |
55 | ### Polars
56 |
57 | Polars is a local engine similar to Pandas. Fugue has a [Polars Integration](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/polars.html) that allows users to run Polars code across a Spark, Dask, or Ray cluster.
58 |
59 | ## Installation
60 |
61 | To set up your own environment, you can pip (or conda) install the package:
62 |
63 | ```bash
64 | pip install fugue
65 | ```
66 |
67 | Backend engines are installed separately through pip extras. For example, to install Spark:
68 |
69 | ```bash
70 | pip install "fugue[spark]"
71 | ```
72 |
73 | If Spark, Dask, or Ray are already installed on your machine, Fugue will be able to detect them. Spark requires Java to be installed separately.
74 |
75 | ## Running the Code
76 |
77 | The simplest way to run the tutorials interactively is to use [mybinder](https://mybinder.org/v2/gh/fugue-project/tutorials/master). Binder spins up an environment using a container.
78 |
79 | >- **Some code snippets run slow on binder** as the machine on binder isn't powerful enough for a distributed framework such as Spark.
80 | >- Parallel executions can become sequential, so some of the performance comparison examples will not give you the correct numbers.
81 |
82 | Alternatively, you should get decent performance if you run the Docker image on your own machine:
83 |
84 | ```bash
85 | docker run -p 8888:8888 fugueproject/tutorials:latest
86 | ```
87 |
88 | ```{toctree}
89 | :maxdepth: 6
90 | :caption: Quick Look
91 | :hidden:
92 |
93 | tutorials/quick_look/ten_minutes
94 | tutorials/quick_look/ten_minutes_sql
95 | ```
96 |
97 | ```{toctree}
98 | :maxdepth: 6
99 | :caption: Tutorials
100 | :hidden:
101 |
102 | tutorials/beginner/index
103 | tutorials/advanced/index
104 | tutorials/fugue_sql/index
105 | tutorials/extensions/index
106 | ```
107 |
108 | ```{toctree}
109 | :maxdepth: 6
110 | :caption: Integrations
111 | :hidden:
112 |
113 | tutorials/integrations/backends/index
114 | tutorials/integrations/cloudproviders/index
115 | tutorials/integrations/warehouses/index
116 | tutorials/integrations/ecosystem/index
117 | ```
118 |
119 | ```{toctree}
120 | :caption: Applications
121 | :hidden:
122 |
123 | tutorials/applications/use_cases/index
124 | tutorials/applications/examples/index
125 | tutorials/applications/recipes/index
126 | tutorials/applications/debugging/index
127 | ```
128 |
129 | ```{toctree}
130 | :caption: Fugue Libraries
131 | :hidden:
132 |
133 | tutorials/tune/index
134 | ```
135 |
136 |
137 | ```{toctree}
138 | :caption: Resources
139 | :hidden:
140 |
141 | tutorials/resources/appendix/index
142 | tutorials/resources/best_practices/index
143 | tutorials/resources/content
144 | tutorials/resources/major_changes
145 | ```
146 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pre-commit
2 | mypy
3 | flake8
4 | autopep8
5 | pylint
6 | pytest
7 | pytest-cov
8 | pytest-mock
9 | pytest-spark
10 | sphinx~=4.0
11 | myst-parser>=0.15
12 | myst-nb~=0.13.1
13 |
14 | nbconvert>=6.5.0
15 | nbsphinx>=0.8.6
16 | pandoc>=2.0
17 | ipython>=7.31
18 | jupyter-book>=0.13
19 |
20 | flask
21 | fugue[spark]>=0.8.1
22 | fugue-cloudprovider
23 | fugue-warehouses[bigquery]
24 | tune
25 |
26 | pandera
27 | scikit-learn
28 | matplotlib
29 | seaborn
30 | xgboost
--------------------------------------------------------------------------------
/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.ui.showConsoleProgress false
2 | spark.sql.adaptive.enabled false
3 | spark.sql.shuffle.partitions 8
4 | spark.sql.execution.arrow.pyspark.enabled true
--------------------------------------------------------------------------------
/tutorials/advanced/index.md:
--------------------------------------------------------------------------------
1 | # Deep Dive
2 |
3 | All questions are welcome in the Slack channel.
4 |
5 | [](http://slack.fugue.ai)
6 |
7 | This section is not needed to create end-to-end workflows with Fugue, but it will help give a better understanding of the features available. In some cases, applying these concepts may significantly improve performance.
8 |
9 | If you already have experience with Spark or distributed computing in general, you may be interested in the extra value Fugue can add.
10 |
11 | ```{toctree}
12 | :hidden:
13 |
14 | useful_config
15 | execution_engine
16 | validation
17 | schema_dataframes
18 | partition
19 | rpc
20 | x-like
21 | ```
22 |
23 | ## Architecture
24 |
25 | 
26 |
27 | ## [Fugue Configurations](useful_config.ipynb) (MUST READ)
28 | These configurations can have a significant impact on building and running Fugue workflows.
29 |
30 | ## [Execution Engine](execution_engine.ipynb)
31 | The heart of Fugue. It is the layer that unifies many of the core concepts of distributed computing and separates the underlying computing frameworks from user-level logic. Normally you don't directly interact with execution engines, but it's good to understand some basics.
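
As a small illustration (a sketch added for this overview, assuming Spark is installed), `fugue.api.engine_context` sets the default engine for a block of Fugue API calls so you don't pass `engine=` to each one:

```python
import pandas as pd
import fugue.api as fa

df = pd.DataFrame({"a": [1, 2, 3]})

# schema: *,b:long
def plus_one(pdf: pd.DataFrame) -> pd.DataFrame:
    return pdf.assign(b=pdf["a"] + 1)

# every Fugue API call in this block defaults to the Spark engine
with fa.engine_context("spark"):
    res = fa.transform(df, plus_one)
    fa.show(res, 3)
```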
32 |
33 | ## [Validation](validation.ipynb)
34 | Fugue lets extensions declare validation rules on their inputs, such as required partition keys or columns.
35 |
36 | ## [Data Type, Schema & DataFrames](schema_dataframes.ipynb)
37 | Fugue data types and schema are strictly based on [Apache Arrow](https://arrow.apache.org/docs/index.html). The DataFrame is an abstract concept with several built-in implementations that adapt to different underlying dataframes. In this tutorial, we will go through the basic APIs and focus on the most common use cases.
38 |
39 | ## [Partition](partition.ipynb) (MUST READ)
40 | This tutorial focuses on explaining the basic ideas of data partitioning, which is less specific to Fugue. A good understanding of partitioning is key to writing high-performance code.
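
As a quick taste (an illustrative sketch added here, not an excerpt from the notebook), partitioning is commonly expressed through the `partition` argument of `transform`; each group arrives at the function as its own presorted pandas DataFrame:

```python
import pandas as pd
import fugue.api as fa

df = pd.DataFrame({"group": ["x", "x", "y"], "value": [3, 1, 2]})

# schema: group:str,top:long
def top_value(pdf: pd.DataFrame) -> pd.DataFrame:
    # called once per "group" partition, already sorted by value descending
    return pd.DataFrame({"group": [pdf["group"].iloc[0]], "top": [pdf["value"].iloc[0]]})

res = fa.transform(df, top_value, partition={"by": "group", "presort": "value desc"})
```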
41 |
42 | ## [Callbacks From Transformers To Driver](rpc.ipynb)
43 | You can provide a callback function to any transformer to communicate with the driver while it runs.
44 |
45 | ## [X-Like Objects Initialization](x-like.ipynb)
46 | You may often see X-like objects in the Fugue API documentation. Here is a complete list of these objects and the ways to initialize them.
--------------------------------------------------------------------------------
/tutorials/advanced/useful_config.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fugue Configurations\n",
8 | "\n",
9 | "Have questions? Chat with us on Github or Slack:\n",
10 | "\n",
11 | "[](https://github.com/fugue-project/fugue)\n",
12 | "[](http://slack.fugue.ai)\n",
13 | "\n",
14 | "| Config | Default | Description |\n",
15 | "| :--- | :---: | :--- |\n",
16 | "| **fugue.spark.use_pandas_udf** | `True` | Automatically use pandas udf for `groupBY apply` semantic, see [details](#Use-Pandas-UDF-on-SparkExecutionEngine) |\n",
17 | "| **fugue.sql.compile.ignore_case** | `False` | When this is `True`, keywords in FugueSQL will be case insensitive |\n",
18 | "| **fugue.rpc.server** | [NativeRPCServer](https://fugue.readthedocs.io/en/latest/api/fugue.rpc.html#fugue.rpc.base.NativeRPCServer) | Full path to a subclass of [RPCServer](https://fugue.readthedocs.io/en/latest/api/fugue.rpc.html#fugue.rpc.base.RPCServer) |"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Use Pandas UDF on SparkExecutionEngine\n",
26 | "\n",
27 | "**Notice: you may not see the expected performance on binder, it's recommended to run this tutorial on docker on a multiple core machine to get decent performance**\n",
28 | "\n",
29 | "If you don't know pandas UDF, read [this](https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html). With PyArrow and pandas, Spark is able to accelerate certain operations.\n",
30 | "\n",
31 | "In Spark 3.0 it also starts to support [some type annotations](https://databricks.com/blog/2020/05/20/new-pandas-udfs-and-python-type-hints-in-the-upcoming-release-of-apache-spark-3-0.html). But Fugue is more flexibile on type annotations. Besides `pd.DataFrame` you can also use other annotations including `List` and `Iterable`, etc.\n",
32 | "\n",
33 | "For certain cases, no matter what input type you specify, we can see great performance gain. But to maximize the gain, it's suggested to use `pd.DataFrame` as the input and output to remove conversion overhead. By doing this, it may hurt the performance on other execution engines, or on Spark without pandas_udf support. So you need to understand the pros and cons. The best way is to experiment and decide.\n",
34 | "\n",
35 | "In Fugue, only when all of the following are satisfied, it uses `pandas_udf`, otherwise, it will fall back to the common way.\n",
36 | "\n",
37 | "* config **fugue.spark.use_pandas_udf** is set to True (default)\n",
38 | "* `partition_spec` has to have non empty partition keys\n",
39 | "* output schema can't have nested types\n",
40 | "* Spark config **spark.sql.execution.arrow.pyspark.enabled** is set to `\"true\"`\n",
41 | "\n",
42 | "Plus, for **pyspark < 3** this environment variable must be set on driver and all executors:\n",
43 | "```\n",
44 | "ARROW_PRE_0_15_IPC_FORMAT=1\n",
45 | "```\n",
46 | "otherwise errors will be thrown."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "from pyspark.sql import SparkSession\n",
56 | "spark = SparkSession.builder\\\n",
57 | " .config(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\\\n",
58 | " .getOrCreate()"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | "23/01/03 00:16:30 WARN TaskSetManager: Stage 6 contains a task of very large size (3910 KiB). The maximum recommended task size is 1000 KiB.\n",
71 | "pandas.median\n",
72 | "SparkDataFrame\n",
73 | "a:int|b:double\n",
74 | "-----+--------\n",
75 | "2 |4.0 \n",
76 | "9 |4.0 \n",
77 | "3 |4.0 \n",
78 | "7 |5.0 \n",
79 | "4 |5.0 \n",
80 | "1.0500444139999985\n",
81 | "23/01/03 00:16:31 WARN TaskSetManager: Stage 9 contains a task of very large size (3910 KiB). The maximum recommended task size is 1000 KiB.\n",
82 | "pandas.median\n",
83 | "SparkDataFrame\n",
84 | "a:int|b:double\n",
85 | "-----+--------\n",
86 | "2 |4.0 \n",
87 | "9 |4.0 \n",
88 | "3 |4.0 \n",
89 | "7 |5.0 \n",
90 | "4 |5.0 \n",
91 | "0.9417272339999982\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "import pandas as pd\n",
97 | "import numpy as np\n",
98 | "from timeit import timeit\n",
99 | "from typing import Iterable, List, Any\n",
100 | "import fugue.api as fa\n",
101 | "\n",
102 | "def helper(ct=2000000) -> pd.DataFrame:\n",
103 | " np.random.seed(0)\n",
104 | " return pd.DataFrame(np.random.randint(0,10,size=(ct, 2)), columns=list('ab'))\n",
105 | "\n",
106 | "# schema: a:int,b:double\n",
107 | "def median(df:pd.DataFrame) -> List[List[Any]]:\n",
108 | " b = df[\"b\"].median()\n",
109 | " return [[df.loc[0,\"a\"], b]]\n",
110 | "\n",
111 | "def run(engine, conf=None):\n",
112 | " with fa.engine_context(engine):\n",
113 | " res = fa.transform(helper(), \n",
114 | " median,\n",
115 | " partition={\"by\": \"a\"}, \n",
116 | " engine_conf=conf\n",
117 | " )\n",
118 | " fa.show(res, 5, title=\"pandas.median\");\n",
119 | "\n",
120 | "print(timeit(lambda: run(spark), number=1))\n",
121 | "\n",
122 | "conf = {\"fugue.spark.use_pandas_udf\":True}\n",
123 | "print(timeit(lambda: run(spark, conf=conf), number=1))"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "## Ignore Case in Fugue SQL\n",
131 | "\n",
132 | "Normally, when writing FugueSQL, you upper case keywords by yourself"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 5,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "ArrayDataFrame\n",
145 | "a:int\n",
146 | "-----\n",
147 | "0 \n",
148 | "Total count: 1\n",
149 | "\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "from fugue.api import fugue_sql_flow\n",
155 | "\n",
156 | "fugue_sql_flow(\"\"\"\n",
157 | " CREATE [[0]] SCHEMA a:int\n",
158 | " PRINT\n",
159 | " \"\"\").run();"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "But you can turn pass `fsql_ignore_case=True`"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 6,
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "name": "stdout",
176 | "output_type": "stream",
177 | "text": [
178 | "ArrayDataFrame\n",
179 | "a:int\n",
180 | "-----\n",
181 | "0 \n",
182 | "Total count: 1\n",
183 | "\n"
184 | ]
185 | }
186 | ],
187 | "source": [
188 | "fugue_sql_flow(\"\"\"\n",
189 | " create [[0]] schema a:int\n",
190 | " print\n",
191 | " \"\"\", fsql_ignore_case=True).run();"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "This can make the sql less readable and make you less aware of syntax ambiguity or errors, but it may be handy if you want to migrate other SQL queries into FugueSQL.\n",
199 | "\n",
200 | "If there are many `fugue_sql_flow` calls, it might be easier to set `fugue.sql.compile.ignore_case` on the execution engine."
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "## RPCServer settings\n",
208 | "\n",
209 | "If you do not have any callbacks in your workflow, don't set this config.\n",
210 | "\n",
211 | "For testing callbacks on local machine, don't set this config. [NativeRPCServer](https://fugue.readthedocs.io/en/latest/api/fugue.rpc.html#fugue.rpc.base.NativeRPCServer) Will be used.\n",
212 | "\n",
213 | "Only when you use a distributed execution engine, and you want to use callbacks, set to a server that is distributable.\n",
214 | "\n",
215 | "[FlaskRPCServer](https://fugue.readthedocs.io/en/latest/api/fugue.rpc.html#fugue.rpc.flask.FlaskRPCServer) can be used with a distributed execution engine. Unless you have special needs, you just need to follow the example below."
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 1,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "conf = {\n",
225 | " \"fugue.rpc.server\": \"fugue.rpc.flask.FlaskRPCServer\",\n",
226 | " \"fugue.rpc.flask_server.host\": \"0.0.0.0\",\n",
227 | " \"fugue.rpc.flask_server.port\": \"1234\",\n",
228 | " \"fugue.rpc.flask_server.timeout\": \"2 sec\",\n",
229 | "}"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "To use `fugue.rpc.flask.FlaskRPCServer`, you must set `fugue.rpc.flask_server.host` and `fugue.rpc.flask_server.port`, and it's suggested to also set `fugue.rpc.flask_server.timeout` to a reasonable timeout for your own case."
237 | ]
238 | }
239 | ],
240 | "metadata": {
241 | "kernelspec": {
242 | "display_name": "Python 3.8.13 ('fugue')",
243 | "language": "python",
244 | "name": "python3"
245 | },
246 | "language_info": {
247 | "codemirror_mode": {
248 | "name": "ipython",
249 | "version": 3
250 | },
251 | "file_extension": ".py",
252 | "mimetype": "text/x-python",
253 | "name": "python",
254 | "nbconvert_exporter": "python",
255 | "pygments_lexer": "ipython3",
256 | "version": "3.8.13"
257 | },
258 | "vscode": {
259 | "interpreter": {
260 | "hash": "9fcd6e71927f6b3e5f4fa4280b4e8e6a66aa8d4365bb61cf7ef4017620fc09b9"
261 | }
262 | }
263 | },
264 | "nbformat": 4,
265 | "nbformat_minor": 4
266 | }
267 |
--------------------------------------------------------------------------------
/tutorials/advanced/x-like.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# X-like Objects\n",
8 | "\n",
9 | "Have questions? Chat with us on Github or Slack:\n",
10 | "\n",
11 | "[](https://github.com/fugue-project/fugue)\n",
12 | "[](http://slack.fugue.ai)\n",
13 | "\n",
14 | "In Fugue, it's flexibile to initialize many built-in objects. This is a tutorial for all of them.\n",
15 | "\n",
16 | "## Schema\n",
17 | "\n",
18 | "Fugue creates a special syntax to represent schema: Separated by `,`, each column type pair is `:`\n",
19 | "\n",
20 | "For example: `a:int,b:str` or `a:int,b_array:[int],c_dict:{x:int,y:str}`"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "a:int,b:str\n",
33 | "a:int,b_array:[long],c_dict:{x:int,y:str}\n",
34 | "pa schema a: int32\n",
35 | "b: string\n",
36 | "a:int,b:str\n",
37 | "c:str,d:long\n",
38 | "c:str,d:long\n",
39 | "e:str,f:str\n",
40 | "e:str,f:str,g:long\n",
41 | "a:int,b:str\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "from fugue import Schema\n",
47 | "\n",
48 | "print(Schema(\"a:int,b:str\"))\n",
49 | "print(Schema(\"a:int32,b_array:[int64],c_dict:{x:int,y:string}\"))\n",
50 | "\n",
51 | "# get pyarrow schema\n",
52 | "schema = Schema(\" a : int , b : str\") # space is ok\n",
53 | "print(\"pa schema\", schema.pa_schema)\n",
54 | "\n",
55 | "# more ways to initialized fugue Schema\n",
56 | "print(Schema(schema.pa_schema)) # by pyarrow schema\n",
57 | "print(Schema(c=str,d=int)) # pythonic way\n",
58 | "print(Schema(dict(c=str,d=int))) # pythonic way\n",
59 | "print(Schema(\"e:str\",\"f:str\")) # you can separate\n",
60 | "print(Schema([\"e:str\",\"f:str\"], (\"g\",int))) # you can separate, notice int in python means long in schema\n",
61 | "print(Schema(Schema(\"a:int\",\"b:str\"))) # you can separate"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "## Partition"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 2,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/plain": [
79 | "PartitionSpec(num='4', by=['a', 'b'], presort='')"
80 | ]
81 | },
82 | "execution_count": 2,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "from fugue import PartitionSpec\n",
89 | "\n",
90 | "assert PartitionSpec().empty # empty partition spec means no operation needed, it can be the default value\n",
91 | "PartitionSpec(num=4)\n",
92 | "PartitionSpec(algo=\"even\",num=4,by=[\"a\",\"b\"],presort=\"c,d desc\") # c,d desc == c ASC, d DESC\n",
93 | "\n",
94 | "# you can use expression in num, ROWCOUNT can be used to indicate using the row count of the dataframe to operate on\n",
95 | "# if a df has 1000 rows, this means I want to even partition it to 10 rows per partition\n",
96 | "PartitionSpec(algo=\"even\",num=\"ROWCOUNT/10\")\n",
97 | "\n",
98 | "PartitionSpec({\"num\":4, \"by\":[\"a\",\"b\"]}) # from dict, using dict on `partition-like` parameters is common\n",
99 | "PartitionSpec('{\"num\":4}') # from json\n",
100 | "\n",
101 | "a = PartitionSpec(num=4)\n",
102 | "b = PartitionSpec(by=[\"a\"])\n",
103 | "c = PartitionSpec(a,b) # combine\n",
104 | "\n",
105 | "p = PartitionSpec(num=4, by=[\"a\"])\n",
106 | "PartitionSpec(p, by=[\"a\",\"b\"], algo=\"even\") # override"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "## RPC\n",
114 | "\n",
115 | "For callbacks you defined for transformers, you can provide a lambda function, a native python function, or an instance implementing [RPCHandler](https://fugue.readthedocs.io/en/latest/api/fugue.rpc.html#fugue.rpc.base.RPCHandler)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 4,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "Index(['a', 'b'], dtype='object')\n",
128 | "Index(['a', 'b'], dtype='object')\n",
129 | "Index(['a', 'b'], dtype='object')\n",
130 | "Index(['a', 'b'], dtype='object')\n",
131 | "Index(['a', 'b'], dtype='object')\n",
132 | "Index(['a', 'b'], dtype='object')\n",
133 | "Index(['a', 'b'], dtype='object')\n",
134 | "Index(['a', 'b'], dtype='object')\n",
135 | "Index(['a', 'b'], dtype='object')\n"
136 | ]
137 | },
138 | {
139 | "data": {
140 | "text/html": [
141 | "
\n",
142 | "\n",
155 | "
\n",
156 | " \n",
157 | "
\n",
158 | "
\n",
159 | "
a
\n",
160 | "
b
\n",
161 | "
\n",
162 | " \n",
163 | " \n",
164 | "
\n",
165 | "
0
\n",
166 | "
0
\n",
167 | "
0
\n",
168 | "
\n",
169 | "
\n",
170 | "
1
\n",
171 | "
0
\n",
172 | "
1
\n",
173 | "
\n",
174 | "
\n",
175 | "
2
\n",
176 | "
1
\n",
177 | "
1
\n",
178 | "
\n",
179 | "
\n",
180 | "
3
\n",
181 | "
2
\n",
182 | "
2
\n",
183 | "
\n",
184 | " \n",
185 | "
\n",
186 | "
"
187 | ],
188 | "text/plain": [
189 | " a b\n",
190 | "0 0 0\n",
191 | "1 0 1\n",
192 | "2 1 1\n",
193 | "3 2 2"
194 | ]
195 | },
196 | "execution_count": 4,
197 | "metadata": {},
198 | "output_type": "execute_result"
199 | }
200 | ],
201 | "source": [
202 | "import pandas as pd\n",
203 | "import fugue.api as fa\n",
204 | "from fugue.rpc import RPCHandler\n",
205 | "\n",
206 | "def print_columns_and_return(df:pd.DataFrame, cb:callable) -> pd.DataFrame:\n",
207 | " cb(str(df.columns))\n",
208 | " return df\n",
209 | "\n",
210 | "def pt(x):\n",
211 | " print(x)\n",
212 | "\n",
213 | "# RPCHandler\n",
214 | "class Handler(RPCHandler):\n",
215 | " def __init__(self):\n",
216 | " super().__init__()\n",
217 | " \n",
218 | " def __call__(self, x):\n",
219 | " print(x)\n",
220 | "\n",
221 | "df = pd.DataFrame([[0,0],[1,1],[0,1],[2,2]], columns=[\"a\",\"b\"])\n",
222 | "\n",
223 | "# lambda\n",
224 | "fa.transform(df, print_columns_and_return, schema=\"*\", partition={\"by\": \"a\"}, callback = lambda x:print(x))\n",
225 | "\n",
226 | "# function\n",
227 | "fa.transform(df, print_columns_and_return, schema=\"*\", partition={\"by\": \"a\"}, callback = pt)\n",
228 | "\n",
229 | "# RPCHandler class\n",
230 | "fa.transform(df, print_columns_and_return, schema=\"*\", partition={\"by\": \"a\"}, callback = Handler())"
231 | ]
232 | }
233 | ],
234 | "metadata": {
235 | "kernelspec": {
236 | "display_name": "Python 3.8.9 64-bit",
237 | "language": "python",
238 | "name": "python3"
239 | },
240 | "language_info": {
241 | "codemirror_mode": {
242 | "name": "ipython",
243 | "version": 3
244 | },
245 | "file_extension": ".py",
246 | "mimetype": "text/x-python",
247 | "name": "python",
248 | "nbconvert_exporter": "python",
249 | "pygments_lexer": "ipython3",
250 | "version": "3.8.13"
251 | },
252 | "vscode": {
253 | "interpreter": {
254 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
255 | }
256 | }
257 | },
258 | "nbformat": 4,
259 | "nbformat_minor": 4
260 | }
261 |
--------------------------------------------------------------------------------
/tutorials/applications/debugging/index.md:
--------------------------------------------------------------------------------
1 | # Debugging
2 |
3 | This is a list of examples of common Fugue errors and the causes behind them.
4 |
5 | [](http://slack.fugue.ai)
6 |
7 |
8 | ```{toctree}
9 | :hidden:
10 |
11 | unknown_opcode
12 | ```
13 |
14 | ## Dask
15 |
16 | ### [Unknown opcode](unknown_opcode.ipynb)
17 | Some users encounter an error like `Exception: "SystemError(\'unknown opcode\')`. This normally doesn't happen locally, but when the code is brought to the cluster, this exception gets raised. It is usually caused by a Python version mismatch, as we'll see in this section, along with some advice on how to diagnose it.
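
A quick way to check for such mismatches (an illustrative sketch added here, not part of the original page; replace the address with your own scheduler's) is dask's built-in version report:

```python
from dask.distributed import Client

# point this at your own Dask scheduler
client = Client("tcp://scheduler-address:8786")

# compares Python, dask, and distributed versions across the client,
# the scheduler, and all workers, and complains on mismatch
client.get_versions(check=True)
```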
--------------------------------------------------------------------------------
/tutorials/applications/debugging/unknown_opcode.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# SystemError - unknown opcode\n",
8 | "\n",
9 | "This normally arises when using Dask on a cluster (it will not happen locally). The error will look something like this.\n",
10 | "\n",
11 | "```\n",
12 | "SystemError: unknown opcode\n",
13 | "```"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "The most common cause is an inconsistent environment between local and the cluster. The local versions of Python, dask, and distributed all need to be aligned because code is serialized with cloudpickle on the local side before it is sent to the cluster. This code is then unpickled and the deserialization will be inconsistent if the Python versions are inconsistent.\n",
21 | "\n",
22 | "[This Github issue](https://github.com/dask/distributed/issues/5331) replicates the issue by using a nested function. We can test this is by running the following code on the Dask cluster. When using the code snippet below, configure the Client to point to your Dask cluster."
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "from dask.distributed import Client\n",
32 | "\n",
33 | "# insert your client here\n",
34 | "client = Client()\n",
35 | "\n",
36 | "from time import sleep\n",
37 | "import numpy as np\n",
38 | "from fugue import transform\n",
39 | "import pandas as pd\n",
40 | "import dask.dataframe as dd\n",
41 | "\n",
42 | "def wrap():\n",
43 | " # schema: *,x:int\n",
44 | " def ppp(df:pd.DataFrame) -> pd.DataFrame:\n",
45 | " sleep(2)\n",
46 | " return df.assign(x=2)\n",
47 | " \n",
48 | " n=10000000\n",
49 | " df = pd.DataFrame(dict(\n",
50 | " a=np.random.rand(n),\n",
51 | " b=np.random.rand(n)\n",
52 | " ))\n",
53 | " ddf = dd.from_pandas(df, npartitions=16)\n",
54 | " \n",
55 | " return transform(ddf, ppp, engine=\"dask\").compute()"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "This will give a traceback similar to the following. \n",
63 | "```\n",
64 | "SystemError Traceback (most recent call last)\n",
65 | " in \n",
66 | "----> 1 wrap()\n",
67 | "\n",
68 | " in wrap()\n",
69 | " 22 ddf = dd.from_pandas(df, npartitions=16)\n",
70 | " 23 \n",
71 | "---> 24 return transform(ddf, ppp, engine=\"dask\").compute()\n",
72 | "\n",
73 | "/usr/local/lib/python3.9/site-packages/dask/base.py in compute(self, **kwargs)\n",
74 | " 286 dask.base.compute\n",
75 | " 287 \"\"\"\n",
76 | "--> 288 (result,) = compute(self, traverse=False, **kwargs)\n",
77 | " 289 return result\n",
78 | " 290 \n",
79 | "\n",
80 | "/usr/local/lib/python3.9/site-packages/dask/base.py in compute(*args, **kwargs)\n",
81 | " 568 postcomputes.append(x.__dask_postcompute__())\n",
82 | " 569 \n",
83 | "--> 570 results = schedule(dsk, keys, **kwargs)\n",
84 | " 571 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])\n",
85 | " 572 \n",
86 | "\n",
87 | "/usr/local/lib/python3.9/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)\n",
88 | " 2691 should_rejoin = False\n",
89 | " 2692 try:\n",
90 | "-> 2693 results = self.gather(packed, asynchronous=asynchronous, direct=direct)\n",
91 | " 2694 finally:\n",
92 | " 2695 for f in futures.values():\n",
93 | "\n",
94 | "/usr/local/lib/python3.9/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)\n",
95 | " 1967 else:\n",
96 | " 1968 local_worker = None\n",
97 | "-> 1969 return self.sync(\n",
98 | " 1970 self._gather,\n",
99 | " 1971 futures,\n",
100 | "\n",
101 | "/usr/local/lib/python3.9/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)\n",
102 | " 863 return future\n",
103 | " 864 else:\n",
104 | "--> 865 return sync(\n",
105 | " 866 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs\n",
106 | " 867 )\n",
107 | "\n",
108 | "/usr/local/lib/python3.9/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)\n",
109 | " 325 if error[0]:\n",
110 | " 326 typ, exc, tb = error[0]\n",
111 | "--> 327 raise exc.with_traceback(tb)\n",
112 | " 328 else:\n",
113 | " 329 return result[0]\n",
114 | "\n",
115 | "/usr/local/lib/python3.9/site-packages/distributed/utils.py in f()\n",
116 | " 308 if callback_timeout is not None:\n",
117 | " 309 future = asyncio.wait_for(future, callback_timeout)\n",
118 | "--> 310 result[0] = yield future\n",
119 | " 311 except Exception:\n",
120 | " 312 error[0] = sys.exc_info()\n",
121 | "\n",
122 | "/usr/local/lib/python3.9/site-packages/tornado/gen.py in run(self)\n",
123 | " 760 \n",
124 | " 761 try:\n",
125 | "--> 762 value = future.result()\n",
126 | " 763 except Exception:\n",
127 | " 764 exc_info = sys.exc_info()\n",
128 | "\n",
129 | "/usr/local/lib/python3.9/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)\n",
130 | " 1832 exc = CancelledError(key)\n",
131 | " 1833 else:\n",
132 | "-> 1834 raise exception.with_traceback(traceback)\n",
133 | " 1835 raise exc\n",
134 | " 1836 if errors == \"skip\":\n",
135 | "\n",
136 | "/opt/conda/envs/coiled/lib/python3.8/site-packages/dask/optimization.py in __call__()\n",
137 | "\n",
138 | "/opt/conda/envs/coiled/lib/python3.8/site-packages/dask/core.py in get()\n",
139 | "\n",
140 | "/opt/conda/envs/coiled/lib/python3.8/site-packages/dask/core.py in _execute_task()\n",
141 | "\n",
142 | "/opt/conda/envs/coiled/lib/python3.8/site-packages/dask/utils.py in apply()\n",
143 | "\n",
144 | "/opt/conda/envs/coiled/lib/python3.8/site-packages/dask/dataframe/core.py in apply_and_enforce()\n",
145 | "\n",
146 | "/usr/local/lib/python3.9/site-packages/fugue_dask/execution_engine.py in _map()\n",
147 | " 196 pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True\n",
148 | " 197 )\n",
149 | "--> 198 if on_init_once is not None:\n",
150 | " 199 on_init_once(0, input_df)\n",
151 | " 200 cursor = partition_spec.get_cursor(input_schema, 0)\n",
152 | "\n",
153 | "SystemError: unknown opcode\n",
154 | "```"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "In the traceback above, notice there are two versions of Python. Dask requires consistent Python versions between the client and cluster."
162 | ]
163 | }
164 | ],
165 | "metadata": {
166 | "kernelspec": {
167 | "display_name": "Python 3.8.9 64-bit",
168 | "language": "python",
169 | "name": "python3"
170 | },
171 | "language_info": {
172 | "name": "python",
173 | "nbconvert_exporter": "python",
174 | "version": "3.8.9"
175 | },
176 | "orig_nbformat": 2,
177 | "vscode": {
178 | "interpreter": {
179 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
180 | }
181 | }
182 | },
183 | "nbformat": 4,
184 | "nbformat_minor": 2
185 | }
186 |
--------------------------------------------------------------------------------
/tutorials/applications/examples/index.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | End to end examples of using Fugue. Have questions? Chat with us on Github or Slack:
4 |
5 | [](https://github.com/fugue-project/fugue)
6 | [](http://slack.fugue.ai)
7 |
8 |
9 | ```{toctree}
10 | :hidden:
11 |
12 | stock_sentiment
13 | example_covid19
14 | ```
15 |
16 | ## [Stock Sentiment](stock_sentiment.ipynb)
17 | Using Fugue to analyze stock sentiment
18 |
19 | ## [COVID-19 Exploration with FugueSQL](example_covid19.ipynb)
20 | Analyzing COVID-19 data with FugueSQL. In this example, we show how to extend FugueSQL with Python and show some operations important for distributed computing such as `PERSIST` and `PREPARTITION`. This example includes tips on working with big data such as testing and persisting intermediate files as parquet.
21 |
--------------------------------------------------------------------------------
/tutorials/applications/recipes/index.md:
--------------------------------------------------------------------------------
1 | # Recipes
2 |
3 | Have questions? Chat with us on Github or Slack:
6 |
7 | [](http://slack.fugue.ai)
8 | [](https://github.com/fugue-project/fugue)
9 |
10 | Recipes cover common DataFrame operations and how to do them using Fugue. The Fugue approach is designed to be readable and scale-agnostic, yet still performant.
11 |
12 | ```{toctree}
13 | :hidden:
14 |
15 | pivot
16 | ```
17 |
18 | ## [Pivot](pivot.ipynb)
19 |
20 | Pivoting a DataFrame means reshaping a tall (long-format) DataFrame into a wide one.
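
A minimal pandas illustration of the idea (the recipe itself shows how to do this in a scale-agnostic way with Fugue; the column names here are made up):

```python
import pandas as pd

tall = pd.DataFrame({"id": [1, 1, 2, 2], "key": ["a", "b", "a", "b"], "val": [10, 20, 30, 40]})
# one row per id, one column per key -> wide format
wide = tall.pivot_table(index="id", columns="key", values="val").reset_index()
```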
21 |
--------------------------------------------------------------------------------
/tutorials/applications/use_cases/index.md:
--------------------------------------------------------------------------------
1 | # Use Cases
2 |
3 | This is a list of examples of Fugue use cases. Any questions are welcome in the Slack channel.
4 |
5 | [](http://slack.fugue.ai)
6 | [](https://github.com/fugue-project/fugue)
7 |
8 | ```{toctree}
9 | :hidden:
10 |
11 | unit_testing
12 | model_sweeping
13 | databricks_connect
14 | nlp
15 | image_classification
16 | ```
17 |
18 | ## [Testing Big Data Applications](unit_testing.ipynb)
19 | Unit testing is a significant pain point in big data applications. In this section, we examine what makes big data applications so hard to test and how Fugue simplifies testing. Through simplified testing, Fugue users often see faster development of big data projects (in addition to lower compute costs).
20 |
21 | ## [Data Validation](../../integrations/ecosystem/pandera.ipynb)
22 | We'll get started with using Fugue and Pandera for data validation. Using Fugue, we can bring Pandas-based libraries into Spark, meaning we don't have to re-implement the same logic twice. Moreover, using Fugue allows us to achieve **validation by partition**, an operation missing in the current data validation frameworks.
23 |
24 | ## [Distributed Model Sweeping](model_sweeping.ipynb)
25 | Even if a dataset fits on a single machine, distributed compute can be used to parallelize model training and train multiple models simultaneously. In addition, Fugue provides an easy interface to train models for each logical grouping of data.
26 |
27 | ## [Natural Language Processing](nlp.ipynb)
28 | Fugue helps parallelize Natural Language Processing (NLP), especially during pre-processing steps which tend to be executed for each row of data. This page shows an example of how to use the `transform()` function to do some pre-processing.
29 |
30 | ## [Image Classification](image_classification.ipynb)
31 | When dealing with images, it is common to train the model on a single GPU but use multiple GPUs for inference, because inference is the more expensive part. Multiple GPUs can also be used for pre-processing on top of Spark or Dask, though pre-processing during training typically happens as batches are fed to the model. For the training itself, a tool like [Horovod](https://github.com/horovod/horovod) can help; Fugue is more helpful on the inference side.
32 |
33 |
34 | ## Using Fugue with Providers
35 |
36 | This section shows how to connect to clusters without using the native Fugue integrations. Fugue has integrations with some cloud providers in the `fugue-cloudprovider` repo. For more native integrations that spin up ephemeral clusters, check [Fugue with Cloud Providers](../../integrations/cloudproviders/index.md).
37 |
38 | ### [Databricks Connect](databricks_connect.ipynb)
39 | Fugue can be used with the `databricks-connect` library to run code on a Databricks cluster by using the SparkSession. Here we'll go over some details of how to set it up.
40 |
--------------------------------------------------------------------------------
/tutorials/beginner/distributed_compute.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Distributed Computing\n",
8 | "\n",
9 | "Have questions? Chat with us on Github or Slack:\n",
10 | "\n",
11 | "[](https://github.com/fugue-project/fugue)\n",
12 | "[](http://slack.fugue.ai)\n",
13 | "\n",
14 | "In the previous sections, we went over how to use Fugue in the form of extensions and basic data operations, such as joins. In this section, we'll talk about how those Fugue extensions scale to big data. These concepts are important for effectively utilizing distributed computing."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Partition and Presort\n",
22 | "\n",
23 | "Our data is spread across several machines, and we often need to rearrange the way the data is spread across the machines. This is because of operations that need all of the related data in one place. For example, calculating the median value per group requires all of the data from the same group on one machine. Fugue allows users to control the partitioning scheme during execution.\n",
24 | "\n",
25 | "We have seen it used with the `transform()` function previously. `take()` is another operation that can be executed per partition. It extracts `n` number of rows per partition."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "data": {
35 | "text/html": [
36 | "
\n",
37 | "\n",
50 | "
\n",
51 | " \n",
52 | "
\n",
53 | "
\n",
54 | "
col1
\n",
55 | "
col2
\n",
56 | "
\n",
57 | " \n",
58 | " \n",
59 | "
\n",
60 | "
0
\n",
61 | "
2
\n",
62 | "
7
\n",
63 | "
\n",
64 | "
\n",
65 | "
1
\n",
66 | "
1
\n",
67 | "
5
\n",
68 | "
\n",
69 | " \n",
70 | "
\n",
71 | "
"
72 | ],
73 | "text/plain": [
74 | " col1 col2\n",
75 | "0 2 7\n",
76 | "1 1 5"
77 | ]
78 | },
79 | "execution_count": 2,
80 | "metadata": {},
81 | "output_type": "execute_result"
82 | }
83 | ],
84 | "source": [
85 | "import fugue.api as fa\n",
86 | "import pandas as pd \n",
87 | "\n",
88 | "df = pd.DataFrame({'col1':[1,1,1,2,2,2], 'col2':[1,4,5,7,4,2]})\n",
89 | "fa.take(df, 1, presort=\"col2 desc\", partition={\"by\":['col1']})"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "The presort expression here was `col2 desc`, which means that the data is sorted in descending order after partitioning. This makes the `take` operation give us the max value."
97 | ]
98 | },
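{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch of the same idea (not part of the original walkthrough), omitting `desc` sorts ascending by default, so the row kept per `col1` group would be the minimum of `col2` instead:\n",
"\n",
"```python\n",
"# ascending is the default sort order, so take() now keeps the minimum col2 per col1 group\n",
"fa.take(df, 1, presort=\"col2\", partition={\"by\": [\"col1\"]})\n",
"```"
]
},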
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "## Persist and Broadcast\n",
104 | "\n",
105 | "Persist and broadcast are two other distributed computing concepts that Fugue has support for. Persist keeps a DataFrame in memory to avoid recomputation. Distributed computing frameworks often need an explicit `persist()` call to know which DataFrames need to be kept, otherwise they tend to be calculated repeatedly.\n",
106 | "\n",
107 | "Broadcasting is making a smaller DataFrame available on all the workers of a cluster. Without `broadcast()`, these small DataFrames would be repeatedly sent to workers whenever they are needed to perform an operation. Broadcasting caches them on the workers."
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 4,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/plain": [
118 | "DataFrame[col1: bigint, col2: bigint]"
119 | ]
120 | },
121 | "execution_count": 4,
122 | "metadata": {},
123 | "output_type": "execute_result"
124 | }
125 | ],
126 | "source": [
127 | "fa.persist(df, engine=\"spark\")\n",
128 | "fa.broadcast(df, engine=\"spark\")"
129 | ]
130 | },
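{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch of the typical pattern (assumptions: a Spark backend is available and the DataFrame is reused more than once), persist once and then reuse the persisted result so it is not recomputed by each downstream step:\n",
"\n",
"```python\n",
"# persist once, then both take() calls reuse the cached result instead of recomputing df\n",
"cached = fa.persist(df, engine=\"spark\")\n",
"top = fa.take(cached, 1, presort=\"col2 desc\", partition={\"by\": [\"col1\"]}, engine=\"spark\")\n",
"bottom = fa.take(cached, 1, presort=\"col2\", partition={\"by\": [\"col1\"]}, engine=\"spark\")\n",
"```"
]
},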
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "## Repartition\n",
136 | "\n",
137 | "Fugue has support for reparitioning a distributed DataFrame. This can be used to increase the number of partitions to increase utilization. In the opposite case, sometimes the overhead of having too many partitions is too much, and an operation can be more performant with less partitions.\n",
138 | "\n",
139 | "It takes a Fugue `PartitionSpec` to partition the data. To see how partitions can be defined, check [partitioning](./partitioning.ipynb)."
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 9,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "data": {
149 | "text/plain": [
150 | "DataFrame[col1: bigint, col2: bigint]"
151 | ]
152 | },
153 | "execution_count": 9,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "fa.repartition(df, {\"num\": 100}, engine=\"spark\")"
160 | ]
161 | }
162 | ],
163 | "metadata": {
164 | "kernelspec": {
165 | "display_name": "Python 3.8.13 ('fugue')",
166 | "language": "python",
167 | "name": "python3"
168 | },
169 | "language_info": {
170 | "codemirror_mode": {
171 | "name": "ipython",
172 | "version": 3
173 | },
174 | "file_extension": ".py",
175 | "mimetype": "text/x-python",
176 | "name": "python",
177 | "nbconvert_exporter": "python",
178 | "pygments_lexer": "ipython3",
179 | "version": "3.8.13"
180 | },
181 | "orig_nbformat": 2,
182 | "vscode": {
183 | "interpreter": {
184 | "hash": "9fcd6e71927f6b3e5f4fa4280b4e8e6a66aa8d4365bb61cf7ef4017620fc09b9"
185 | }
186 | }
187 | },
188 | "nbformat": 4,
189 | "nbformat_minor": 2
190 | }
191 |
--------------------------------------------------------------------------------
/tutorials/beginner/engine_context.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Engine Context\n",
8 | "\n",
9 | "Have questions? Chat with us on Github or Slack:\n",
10 | "\n",
11 | "[](https://github.com/fugue-project/fugue)\n",
12 | "[](http://slack.fugue.ai)\n",
13 | "\n",
14 | "So far we've used Fugue's `transform()` function to port Pandas code to Spark, Dask, and Ray without any rewrites. We also the Fugue API functions `save()` and `load()` in the previous section. In the last section, we encountered code that looked like the following:\n",
15 | "\n",
16 | "```python\n",
17 | "import fugue.api as fa\n",
18 | "\n",
19 | "df = fa.load(\"/tmp/f.parquet\", engine=\"dask\")\n",
20 | "res = fa.transform(df, dummy, schema=\"*\", engine=\"dask\")\n",
21 | "fa.save(res, \"/tmp/f_out.parquet\", engine=\"dask\")\n",
22 | "```\n",
23 | "\n",
24 | "We had to repeat the engine multiple times. To simplify this, we can use the `engine_context` to set a default execution engine for all of the Fugue API functions used inside. For example, all of the Fugue functions below will run on the Dask engine."
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 1,
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "name": "stdout",
34 | "output_type": "stream",
35 | "text": [
36 | "DaskDataFrame\n",
37 | "a:long\n",
38 | "------\n",
39 | "1 \n",
40 | "2 \n",
41 | "Total count: 2\n",
42 | "\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "import pandas as pd\n",
48 | "import fugue.api as fa \n",
49 | "\n",
50 | "df = pd.DataFrame({\"a\": [1,2]})\n",
51 | "df.to_parquet(\"/tmp/f.parquet\")\n",
52 | "\n",
53 | "def dummy(df:pd.DataFrame) -> pd.DataFrame:\n",
54 | " return df\n",
55 | "\n",
56 | "with fa.engine_context(\"dask\"):\n",
57 | " df = fa.load(\"/tmp/f.parquet\")\n",
58 | " res = fa.transform(df, dummy, schema=\"*\")\n",
59 | " fa.show(res)\n",
60 | " fa.save(res, \"/tmp/f_out.parquet\")"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "From the output of the `show()` function, we can see that Dask was used to execute the operations. Using the `engine_context()` is not necessarily required, but it can heavily simplify the code."
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "## Overriding the Engine\n",
75 | "\n",
76 | "The `engine_context()` just sets a default engine, so it can be overridden if needed. In the example below, we use `engine=None` to use Pandas, but we'll specify the engine for the `transform()` call because it may be compute intensive."
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 2,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "name": "stdout",
86 | "output_type": "stream",
87 | "text": [
88 | "DaskDataFrame\n",
89 | "a:long\n",
90 | "------\n",
91 | "1 \n",
92 | "2 \n",
93 | "Total count: 2\n",
94 | "\n"
95 | ]
96 | }
97 | ],
98 | "source": [
99 | "with fa.engine_context(engine=None):\n",
100 | " df = fa.load(\"/tmp/f.parquet\")\n",
101 | " res = fa.transform(df, dummy, schema=\"*\", engine=\"dask\")\n",
102 | " fa.show(res)"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "Even if we passed no engine to the `engine_context`, the Dask engine was used in the `transform()` step and returned a Dask DataFrame."
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "## Functions and engine_context()\n",
117 | "\n",
118 | "The same behavior will apply if a Python function calls Fugue API functions. This allows for grouping of logic into engine-agnostic functions."
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 3,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "DaskDataFrame\n",
131 | "a:long\n",
132 | "------\n",
133 | "1 \n",
134 | "2 \n",
135 | "Total count: 2\n",
136 | "\n"
137 | ]
138 | }
139 | ],
140 | "source": [
141 | "def logic():\n",
142 | " df = fa.load(\"/tmp/f.parquet\")\n",
143 | " res = fa.transform(df, dummy, schema=\"*\")\n",
144 | " fa.show(res)\n",
145 | "\n",
146 | "with fa.engine_context(\"dask\"):\n",
147 | " logic()"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "We can also wrap the whole `engine_context()` block under a function and pass in the engine. The output DataFrame will follow the engine passed. In the example below, a Dask DataFrame is returned."
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 4,
160 | "metadata": {},
161 | "outputs": [
162 | {
163 | "name": "stdout",
164 | "output_type": "stream",
165 | "text": [
166 | "\n"
167 | ]
168 | }
169 | ],
170 | "source": [
171 | "def logic(engine):\n",
172 | " with fa.engine_context(engine):\n",
173 | " df = fa.load(\"/tmp/f.parquet\")\n",
174 | " res = fa.transform(df, dummy, schema=\"*\")\n",
175 | " return res\n",
176 | "\n",
177 | "\n",
178 | "out = logic(\"dask\")\n",
179 | "print(type(out))"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "## Other Python Code\n",
187 | "\n",
188 | "The code inside the `engine_context()` is not limited to Fugue API functions. For example, loops can be used if an operation is being used multiple times. "
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 6,
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "name": "stdout",
198 | "output_type": "stream",
199 | "text": [
200 | "PandasDataFrame\n",
201 | "a:long|x:long\n",
202 | "------+------\n",
203 | "1 |16 \n",
204 | "2 |16 \n",
205 | "Total count: 2\n",
206 | "\n"
207 | ]
208 | }
209 | ],
210 | "source": [
211 | "from fugue.column import col, lit\n",
212 | "\n",
213 | "with fa.engine_context():\n",
214 | " df = fa.load(\"/tmp/f.parquet\")\n",
215 | " df = fa.assign(df, x=lit(1))\n",
216 | " for i in range(4):\n",
217 | " df = fa.assign(df, x=col(\"x\")*lit(2))\n",
218 | " fa.show(df)"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "## Decoupling of Logic and Execution\n",
226 | "\n",
227 | "This section illustrates how to piece together end-to-end workflows that can then we run on Pandas, Spark, Dask, or Ray. The logic is fully decoupled from the execution, which is one of the primary motivations of Fugue. This solves the following problems:\n",
228 | "\n",
229 | "1. Users have to learn an entirely new framework to work with distributed computing problems\n",
230 | "2. Logic written for a *small data* project is not reusable for a *big data* project\n",
231 | "3. Testing becomes a heavyweight process for distributed computing, especially Spark\n",
232 | "4. Along with number 3, iterations for distributed computing problems become slower and more expensive\n",
233 | "\n",
234 | "Fugue's core principle is to minimize code dependency on frameworks as much as possible, leading to flexibility and portability. **By decoupling logic and execution, we can focus on our logic in a scale-agnostic way.** In this section, we saw how to build end-to-end workflows with the Fugue API and the `engine_context()`. "
235 | ]
236 | },
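{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small sketch of this decoupling (reusing the `logic(engine)` function defined above; the Spark line assumes `pyspark` is installed), the same function can target different backends without any change to its body:\n",
"\n",
"```python\n",
"out_pandas = logic(None)      # run locally on Pandas\n",
"out_dask = logic(\"dask\")      # run distributed on Dask\n",
"# out_spark = logic(spark)    # or pass a SparkSession to run on Spark\n",
"```"
]
},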
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 | "## Summary\n",
242 | "\n",
243 | "In this section we covered the `engine_context()` function, which sets the default execution engine for Fugue function calls. By wrapping it or using it alongside functions, it will be easier to group pieces of logic together to form framework-agnostic workloads. This can also be extended to create workflows that utilize different engines. A common use case is heavy processing with Spark, Dask, or Ray, and then doing post-processing with Pandas."
244 | ]
245 | }
246 | ],
247 | "metadata": {
248 | "kernelspec": {
249 | "display_name": "Python 3.8.13 ('fugue')",
250 | "language": "python",
251 | "name": "python3"
252 | },
253 | "language_info": {
254 | "codemirror_mode": {
255 | "name": "ipython",
256 | "version": 3
257 | },
258 | "file_extension": ".py",
259 | "mimetype": "text/x-python",
260 | "name": "python",
261 | "nbconvert_exporter": "python",
262 | "pygments_lexer": "ipython3",
263 | "version": "3.8.13"
264 | },
265 | "vscode": {
266 | "interpreter": {
267 | "hash": "9fcd6e71927f6b3e5f4fa4280b4e8e6a66aa8d4365bb61cf7ef4017620fc09b9"
268 | }
269 | }
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 2
273 | }
274 |
--------------------------------------------------------------------------------
/tutorials/beginner/index.md:
--------------------------------------------------------------------------------
1 | # Getting Started
2 |
3 | Have questions? Chat with us on Github or Slack:
4 |
5 | [](https://github.com/fugue-project/fugue)
6 | [](http://slack.fugue.ai)
7 |
8 | Fugue is an abstraction layer that lets users write code in native Python or Pandas and then port it over to Spark, Dask, and Ray. This section will cover the motivation of Fugue, the benefits of using an abstraction layer, and how to get started. This section is not a complete reference but will be sufficient to get started with writing full workflows in Fugue.
9 |
10 |
11 | ```{toctree}
12 | :hidden:
13 |
14 | transform
15 | type_hinting
16 | schema
17 | partitioning
18 | execution_engine
19 | io
20 | engine_context
21 | joins
22 | transformations
23 | distributed_compute
24 | beginner_sql
25 | ```
26 |
27 | ## [transform() Function](transform.ipynb)
28 | We'll get started by introducing Fugue and showing some motivating use cases. The `transform()` function can take in a Python or Pandas function and scale it out in Spark or Dask without having to modify it. This provides a very simple interface to parallelize Python and Pandas code on distributed computing engines.
29 |
30 | ## [Type Hinting](type_hinting.ipynb)
31 | After diving into the `transform()` function, we look into the further flexibility Fugue provides by accepting functions with different input and output types. This allows users to define their logic in whatever expression makes the most sense and bring native Python functions to Spark, Dask or Ray. Having flexibility is important because distributed computing often goes beyond the scope of processing Pandas-like DataFrames. Think of aggregating API calls or processing image data.
32 |
33 | ## [Schema](schema.ipynb)
34 | Schema is an important part of distributed computing. Some frameworks even require it because schema inference can be especially expensive or inaccurate. Fugue has its own schema implementation with a simplified syntax. This section will look into Fugue's schema expression.
35 |
36 | ## [Partitioning](partitioning.ipynb)
37 | Now that we have seen how functions can be written for Fugue to bring them to Spark or Dask, we look at how the `transform()` function can be applied with partitioning. In Pandas semantics, this would be the equivalent of a `groupby-apply()`. The difference is that partitioning is a core concept in distributed computing because it controls both the logical and physical grouping of data.
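
As a rough sketch of that parallel (`df`, `fn`, and the column name are placeholders, not from the tutorials):

```python
# pandas: df.groupby("col").apply(fn)
# Fugue: the same per-group logic, runnable on Pandas, Spark, Dask, or Ray
transform(df, fn, schema="*", partition={"by": ["col"]}, engine="spark")
```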
38 |
39 | ## [Execution Engine](execution_engine.ipynb)
40 | After seeing how the `transform` function enables the use of Python and Pandas code on Spark, we'll see all of the possible values we can pass as the engine. We can pass strings, cluster addresses, or clients to interact with clusters.
41 |
42 | ## [Saving and Loading](io.ipynb)
43 | Similar to the `transform()` function, the Fugue API also has saving and loading functions compatible with Pandas, Spark, Dask, and Ray. These help in constructing end-to-end workflows that can then be run on top of any backend.
44 |
45 | ## [Engine Context](engine_context.ipynb)
46 | Often, we will have multiple operations that use the same execution engine. Instead of having to pass in the engine each time, we can use the `engine_context()` of the Fugue API. This will set the default execution engine for all Fugue API function calls.
47 |
48 | ## [Joins](joins.ipynb)
49 | Here we'll show the different ways to join DataFrames in Fugue along with union, intersect, and except. SQL and Pandas also have some inconsistencies that users should be aware of when joining. Fugue maintains consistency with SQL (and Spark).
50 |
51 | ## [Transformations](transformations.ipynb)
52 | Beyond `transform()`, the Fugue API provides other engine-agnostic operations on DataFrames. This section covers those additional transformations and how they fit into end-to-end workflows.
53 |
54 | ## [Distributed Computing](distributed_compute.ipynb)
55 | The heart of Fugue is distributed computing. In this section, we'll show the keywords and concepts that allow Fugue to fully utilize the power of distributed computing. This includes `partitions`, `persisting`, and `broadcasting`.
56 |
57 | ## [FugueSQL](beginner_sql.ipynb)
58 | We'll show a bit of [FugueSQL](../fugue_sql/index.md), the SQL interface for using Fugue. This is targeted at heavy SQL users and SQL-lovers who want to use SQL on top of Spark and Dask, or even Pandas. FugueSQL is used on DataFrames in memory as opposed to data in databases.
59 |
60 | With that, you should be ready to implement data workflows using Fugue.
61 |
62 | For full end-to-end examples, check out the [Stock Sentiment](../applications/examples/stock_sentiment.ipynb) and [COVID-19](../applications/examples/example_covid19.ipynb) examples.
63 |
64 | For any questions, feel free to join the [Slack channel](http://slack.fugue.ai).
65 |
--------------------------------------------------------------------------------
/tutorials/beginner/io.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Saving and Loading\n",
8 | "\n",
9 | "Have questions? Chat with us on Github or Slack:\n",
10 | "\n",
11 | "[](https://github.com/fugue-project/fugue)\n",
12 | "[](http://slack.fugue.ai)\n",
13 | "\n",
14 | "So far, we've only covered modifying data with the `transform()` function. We constructed or loaded DataFrames with Pandas and then applied the transformation with a distributed computing engine. This setup will become a bottleneck for large files since we are loading everything at once on the driver node. On the other hand, loading a DataFrame using Spark, Dask, or Ray locks in the code to those frameworks.\n",
15 | "\n",
16 | "In order to make end-to-end workflows that are compatible with all backends, Fugue exposes two main ways to to load and save data that are compatible with all backends. The first is with the `transform()` function. The second is using the `load()` and `save()` functions of the Fugue API.\n",
17 | "\n",
18 | "## transform() using file path\n",
19 | "\n",
20 | "The `transform()` function can take in a file path instead of a DataFrame to load in data before performing the transformation. The engine specified will be used to directly load the file. First, we make an example file:"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 10,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import pandas as pd\n",
30 | "from fugue import transform\n",
31 | "\n",
32 | "df = pd.DataFrame({\"a\": [1,2]})\n",
33 | "df.to_parquet(\"/tmp/f.parquet\")"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "Now we use the Dask engine to load in the data and apply the `dummy()` function."
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 11,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/html": [
51 | "
\n",
52 | "\n",
65 | "
\n",
66 | " \n",
67 | "
\n",
68 | "
\n",
69 | "
a
\n",
70 | "
\n",
71 | " \n",
72 | " \n",
73 | "
\n",
74 | "
0
\n",
75 | "
1
\n",
76 | "
\n",
77 | "
\n",
78 | "
1
\n",
79 | "
2
\n",
80 | "
\n",
81 | " \n",
82 | "
\n",
83 | "
"
84 | ],
85 | "text/plain": [
86 | " a\n",
87 | "0 1\n",
88 | "1 2"
89 | ]
90 | },
91 | "execution_count": 11,
92 | "metadata": {},
93 | "output_type": "execute_result"
94 | }
95 | ],
96 | "source": [
97 | "def dummy(df:pd.DataFrame) -> pd.DataFrame:\n",
98 | " return df\n",
99 | "\n",
100 | "res = transform(\"/tmp/f.parquet\", dummy, schema=\"*\", engine=\"dask\")\n",
101 | "res.compute()"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "To save the results, the `transform()` function can also take in a `save_path` argument. By default, it will return the path where it was saved, which is helpful for consecutive `transform()` calls."
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 12,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "data": {
118 | "text/plain": [
119 | "'/tmp/f_out.parquet'"
120 | ]
121 | },
122 | "execution_count": 12,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "transform(\"/tmp/f.parquet\", dummy, schema=\"*\", engine=\"dask\", save_path=\"/tmp/f_out.parquet\")"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "## load() and save()\n",
136 | "\n",
137 | "The Fugue API also has `load()` and `save()` methods that are compatible with any engine. These are capable of loading `parquet`, `csv`, and `json` files. Using `parquet` when possible is best practice because it contains schema information and does not require additional keywords to parse. These functions can be used independently similar to the `transform()` function."
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 13,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "import fugue.api as fa\n",
147 | "\n",
148 | "df = fa.load(\"/tmp/f.parquet\", engine=\"dask\")\n",
149 | "res = fa.transform(df, dummy, schema=\"*\", engine=\"dask\")\n",
150 | "fa.save(res, \"/tmp/f_out.parquet\", engine=\"dask\")"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "Using these functions gives additional control over loading and saving compared to using the `transform()` function's saving and loading capabilities. Note that the `fa.transform()` in the cell above is exactly the same as the `transform()` function covered in earlier sections."
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 15,
163 | "metadata": {},
164 | "outputs": [
165 | {
166 | "data": {
167 | "text/html": [
168 | "
\n",
169 | "\n",
182 | "
\n",
183 | " \n",
184 | "
\n",
185 | "
\n",
186 | "
col1
\n",
187 | "
\n",
188 | " \n",
189 | " \n",
190 | "
\n",
191 | "
0
\n",
192 | "
1
\n",
193 | "
\n",
194 | "
\n",
195 | "
1
\n",
196 | "
2
\n",
197 | "
\n",
198 | "
\n",
199 | "
2
\n",
200 | "
3
\n",
201 | "
\n",
202 | " \n",
203 | "
\n",
204 | "
"
205 | ],
206 | "text/plain": [
207 | " col1\n",
208 | "0 1\n",
209 | "1 2\n",
210 | "2 3"
211 | ]
212 | },
213 | "execution_count": 15,
214 | "metadata": {},
215 | "output_type": "execute_result"
216 | }
217 | ],
218 | "source": [
219 | "df = pd.DataFrame({\"col1\": [1,2,3], \"col2\": [1,2,3]})\n",
220 | "\n",
221 | "fa.save(df, '/tmp/data.parquet', mode='overwrite')\n",
222 | "fa.save(df, '/tmp/data.csv', mode='overwrite', header=True)\n",
223 | "df2 = fa.load('/tmp/data.parquet')\n",
224 | "df3 = fa.load(\"/tmp/data.csv\", header=True, columns=\"col1:int\")\n",
225 | "df3"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "The `columns` argument of `load()` takes a Fugue schema expression and limits the columns loaded."
233 | ]
234 | },
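{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch (the exact accepted forms may vary by version), the same idea applies to the parquet file saved above, and a plain list of column names can also be passed:\n",
"\n",
"```python\n",
"# read only col1 from the parquet file written earlier\n",
"fa.load(\"/tmp/data.parquet\", columns=[\"col1\"])\n",
"```"
]
},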
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "## Summary\n",
240 | "\n",
241 | "In this section, we learned how save and load DataFrames in an engine-agnostic way. Both methods presented in this section will work across all execution engines. There were some code snippets where we had to repeat `engine=\"dask\"` multiple times. This can be redundant and tedious to type out. In practice, we can define the engine once by using the `engine_context()` we'll learn next section."
242 | ]
243 | }
244 | ],
245 | "metadata": {
246 | "kernelspec": {
247 | "display_name": "Python 3.8.13 ('fugue')",
248 | "language": "python",
249 | "name": "python3"
250 | },
251 | "language_info": {
252 | "codemirror_mode": {
253 | "name": "ipython",
254 | "version": 3
255 | },
256 | "file_extension": ".py",
257 | "mimetype": "text/x-python",
258 | "name": "python",
259 | "nbconvert_exporter": "python",
260 | "pygments_lexer": "ipython3",
261 | "version": "3.8.13"
262 | },
263 | "orig_nbformat": 2,
264 | "vscode": {
265 | "interpreter": {
266 | "hash": "9fcd6e71927f6b3e5f4fa4280b4e8e6a66aa8d4365bb61cf7ef4017620fc09b9"
267 | }
268 | }
269 | },
270 | "nbformat": 4,
271 | "nbformat_minor": 2
272 | }
273 |
--------------------------------------------------------------------------------
/tutorials/extensions/creator.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Creator\n",
8 | "\n",
9 | "`Creator` represents the logic unit to generate a DataFrame. It is used at the start of workflows. The built-in `load` of Fugue is an example of a Creator.\n",
10 | "\n",
11 | "In this tutorial are the methods to define a `Creator`. There is no preferred method and Fugue makes it flexible for users to choose whatever interface works for them. The four ways are native approach, schema hint, decorator, and the class interface in order of simplicity.\n",
12 | "\n",
13 | "## Example Use Cases\n",
14 | "\n",
15 | "* **Reading special data sources** like constructing a DataFrame using an API.\n",
16 | "* **Querying a database** using `pyodbc` and returning a DataFrame\n",
17 | "* **Create mock data for unit tests**."
18 | ]
19 | },
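{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch of the database use case above (the connection string, query, and `pyodbc` usage here are illustrative assumptions, not part of this tutorial), a creator can simply return a `pd.DataFrame` so the schema is inferred from it:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"# returns a pd.DataFrame, so no schema hint is needed\n",
"def read_table(conn_str: str, query: str) -> pd.DataFrame:\n",
"    import pyodbc  # assumes pyodbc is installed and a database is reachable\n",
"    conn = pyodbc.connect(conn_str)\n",
"    try:\n",
"        return pd.read_sql(query, conn)\n",
"    finally:\n",
"        conn.close()\n",
"```\n",
"\n",
"It could then be used like the other creators in this tutorial, e.g. `dag.create(read_table, params=dict(conn_str=..., query=...))`."
]
},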
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Quick Notes on Usage\n",
25 | "\n",
26 | "**ExecutionEngine aware**\n",
27 | "\n",
28 | "* Creators run on the driver so they are aware of the `ExecutionEngine` being used. Passing a parameter with the `ExecutionEngine` annotation will pass in the current `ExecutionEngine`. There is an example of this later.\n",
29 | "\n",
30 | "**Acceptable input DataFrame types**\n",
31 | "\n",
32 | "* `Creator` can't take DataFrames in, but can take other parameters.\n",
33 | "\n",
34 | "**Acceptable output DataFrame types**\n",
35 | "\n",
36 | "* `DataFrame`, `LocalDataFrame`, `pd.DataFrame`, `List[List[Any]]`, `Iterable[List[Any]]`, `EmptyAwareIterable[List[Any]]`, `List[Dict[str, Any]]`, `Iterable[Dict[str, Any]]`, `EmptyAwareIterable[Dict[str, Any]]`\n",
37 | "\n",
38 | "**Further notes**\n",
39 | "\n",
40 | "* If the output type is NOT one of Fugue `DataFrame`, `LocalDataFrame` or `pd.DataFrame`, the output schema must be specified because it can't be inferred."
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "## Native Approach\n",
48 | "\n",
49 | "The native approach is using a regular function without any edits beyond type annotations. It is converted to a Fugue extension during runtime. In the example below, we have two create functions. The first one has an output type of `pd.DataFrame`, which means that the schema is already known. The second one has an output type of `List[List[Any]]`, which does hold schema so it has to be provided during the `create` call inside `FugueWorkflow`."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 1,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "PandasDataFrame\n",
62 | "a:long\n",
63 | "------\n",
64 | "2 \n",
65 | "Total count: 1\n",
66 | "\n",
67 | "ArrayDataFrame\n",
68 | "a:int\n",
69 | "-----\n",
70 | "2 \n",
71 | "Total count: 1\n",
72 | "\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "from typing import Iterable, Dict, Any, List\n",
78 | "import pandas as pd\n",
79 | "from fugue import FugueWorkflow\n",
80 | "\n",
81 | "# fugue knows the schema because the output in pd.DataFrame\n",
82 | "def create1(n=1) -> pd.DataFrame:\n",
83 | " return pd.DataFrame([[n]],columns=[\"a\"])\n",
84 | "\n",
85 | "# schema is not known so it has to be provided later\n",
86 | "def create2(n=1) -> List[List[Any]]:\n",
87 | " return [[n]]\n",
88 | "\n",
89 | "with FugueWorkflow() as dag:\n",
90 | " dag.create(create1, params={\"n\":2}).show()\n",
91 | " dag.create(create2, schema=\"a:int\", params={\"n\":2}).show()"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "## Schema Hint\n",
99 | "\n",
100 | "The schema can also be provided during the function definition through the use of the schema hint comment. Providing it during definition means it does not need to be provided inside the `FugueWorkflow`."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 2,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "ArrayDataFrame\n",
113 | "a:int\n",
114 | "-----\n",
115 | "1 \n",
116 | "Total count: 1\n",
117 | "\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "# schema: a:int\n",
123 | "def create2(n=1) -> List[List[Any]]:\n",
124 | " return [[n]]\n",
125 | "\n",
126 | "with FugueWorkflow() as dag:\n",
127 | " dag.create(create2).show()"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "## Decorator Approach\n",
135 | "\n",
136 | "There is no obvious advantage to use the decorator approach for defining a `Creator`. In general, the decorator is good if the schema is too long to type out as a comment in one line. "
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 3,
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "ArrayDataFrame\n",
149 | "a:int\n",
150 | "-----\n",
151 | "1 \n",
152 | "Total count: 1\n",
153 | "\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "from fugue import creator\n",
159 | "\n",
160 | "@creator(\"a:int\")\n",
161 | "def create(n=1) -> List[List[Any]]:\n",
162 | " return [[n]]\n",
163 | "\n",
164 | "with FugueWorkflow() as dag:\n",
165 | " dag.create(create).show()"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "## Interface Approach (Advanced)\n",
173 | "\n",
174 | "All the previous methods are just wrappers of the interface approach. They cover most of use cases and are simpler to use. But if you want to get all execution context such as partition information, use interface approach.\n",
175 | "\n",
176 | "In the interface approach, type annotations are not necessary but it's good practice to have them."
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 4,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "name": "stdout",
186 | "output_type": "stream",
187 | "text": [
188 | "ArrayDataFrame\n",
189 | "a:int\n",
190 | "-----\n",
191 | "1 \n",
192 | "Total count: 1\n",
193 | "\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "from fugue import Creator, DataFrame\n",
199 | "\n",
200 | "class Array(Creator):\n",
201 | " def create(self) -> DataFrame:\n",
202 | " engine = self.execution_engine\n",
203 | " n = self.params.get_or_throw(\"n\",int)\n",
204 | " return engine.to_df([[n]],\"a:int\")\n",
205 | "\n",
206 | "\n",
207 | "with FugueWorkflow() as dag:\n",
208 | " dag.create(Array, params=dict(n=1)).show()"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "## Using the ExecutionEngine\n",
216 | "\n",
217 | "In some cases, the `Creator` has to be aware of the `ExecutionEngine`. **This is an example of how to write native Spark code inside Fugue.**"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 5,
223 | "metadata": {},
224 | "outputs": [
225 | {
226 | "name": "stdout",
227 | "output_type": "stream",
228 | "text": [
229 | "SparkDataFrame\n",
230 | "a:int\n",
231 | "-----\n",
232 | "2 \n",
233 | "Total count: 1\n",
234 | "\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "from fugue import ExecutionEngine\n",
240 | "from fugue_spark import SparkExecutionEngine, SparkDataFrame\n",
241 | "\n",
242 | "# pay attention to the input and output annotations, they are both general DataFrame\n",
243 | "def create(e:ExecutionEngine, n=1) -> DataFrame:\n",
244 | " assert isinstance(e,SparkExecutionEngine) # this extension only works with SparkExecutionEngine\n",
245 | " sdf= e.spark_session.createDataFrame([[n]], schema=\"a:int\") # this is how you get spark session\n",
246 | " return SparkDataFrame(sdf) # you must wrap as Fugue SparkDataFrame to return\n",
247 | "\n",
248 | "with FugueWorkflow(SparkExecutionEngine) as dag:\n",
249 | " dag.create(create, params={\"n\":2}).show()"
250 | ]
251 | }
252 | ],
253 | "metadata": {
254 | "kernelspec": {
255 | "display_name": "Python 3.7.9 64-bit ('fugue-tutorials': conda)",
256 | "metadata": {
257 | "interpreter": {
258 | "hash": "131b24c7e1bb8763ab2b04d5b6d98a68c7b3a823a2a57c5722935f7690890f70"
259 | }
260 | },
261 | "name": "python3"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.7.9"
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 4
278 | }
--------------------------------------------------------------------------------
/tutorials/extensions/index.md:
--------------------------------------------------------------------------------
1 | # Extensions
2 |
3 | All questions are welcome in the Slack channel.
4 |
5 | [](http://slack.fugue.ai)
6 |
7 | We have previously introduced extensions in the [FugueSQL section](../fugue_sql/extensions.ipynb). This section is a more comprehensive guide to extensions in Fugue. Extensions are user-created functions that perform operations on DataFrames. By converting these functions to the appropriate extension, they can be brought into FugueSQL.
8 |
9 | 
10 |
11 | ## Driver vs Worker
12 |
13 | To fully understand extensions, it is important to understand the basic distributed compute architecture. A compute cluster is composed of one `driver` and multiple `worker` nodes. The `driver` is the machine responsible for orchestrating the `worker` machines. Because the driver is responsible for communicating and distributing work, it has access to information that worker nodes don't have. For example, it keeps track of partition information.
14 |
15 | Because of this distinction in distributed computing, there will be code written that is specifically meant to be run on the `driver` machine. These are things like loading and saving a dataframe, or maybe the logic to divide the data into different partitions. On the other hand, code that runs on `workers` will be agnostic to what is happening on other workers.
16 |
17 | Note that this means a function that gets the maximum value of a column behaves differently depending on whether it is a driver-side or a worker-side extension. The driver-side extension will give a global maximum, while the worker-side extension will give a local maximum per partition.
18 |
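A minimal sketch of this difference (the data and column names here are illustrative, not from the original docs):

```python
import pandas as pd
from fugue import transform

df = pd.DataFrame({"b": ["x", "x", "y", "y"], "a": [1, 5, 3, 9]})

# Worker-side logic runs independently on each partition, so this yields a
# *local* maximum per partition rather than a single global maximum.
def local_max(df: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame({"b": [df["b"].iloc[0]], "max_a": [df["a"].max()]})

transform(df, local_max, schema="b:str,max_a:long", partition={"by": ["b"]})
```
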
19 | ## Extension Types
20 |
21 | In the following descriptions, note the difference between `DataFrame` and `LocalDataFrame`. A `LocalDataFrame` is a dataframe that exists on a single machine. The `LocalDataFrame` is an abstraction for structures like `pd.DataFrame` or `List[List[Any]]`. On the other hand, the `DataFrame` is a dataframe that can exist on multiple machines. Fugue's `DataFrame` class is an abstract version of Spark or Dask DataFrames. `DataFrame` can only be used on the driver.
22 |
23 | **Driver-side extensions**
24 |
25 | * [**Creator**](./creator.ipynb): no input, single output `DataFrame`; it produces a `DataFrame` to be used by other extensions
26 | * [**Processor**](./processor.ipynb): one or multiple input `DataFrames`, single output `DataFrame`; it applies a transformation and passes the result to the next node
27 | * [**Outputter**](./outputter.ipynb): one or multiple input `DataFrames`, no output; it finalizes the processing of the input, for example saving or printing
28 |
29 | **Worker-side extensions**
30 |
31 | * [**Transformer**](./transformer.ipynb): single `LocalDataFrame` in, single `LocalDataFrame` out
32 | * [**CoTransformer**](./cotransformer.ipynb): one or multiple `LocalDataFrame` in, single `LocalDataFrame` out
33 |
34 | **Advanced worker-side extensions**
35 | * [**OutputTransformer**](./outputtransformer.ipynb): single `LocalDataFrame` in, no output
36 | * [**OutputCoTransformer**](./outputcotransformer.ipynb): one or multiple `LocalDataFrame` in, no output
37 |
38 | ## [Interfaceless](./interfaceless.ipynb)
39 |
40 | These extensions can be defined with the appropriate Python class or decorator. For example, a `transformer` can be defined with the `Transformer` class or by using the `@transformer` decorator with a Python function. These are **interfaces** provided by Fugue, but they are not required to convert functions to extensions. As seen in the beginner tutorial, schema hints can be used to define extensions. For example, the following function will create a new column called `c`.
41 |
42 | ```python
43 | # schema: *,c:int
44 | def add_transformer(df:pd.DataFrame) -> pd.DataFrame:
45 | df['c'] = df['a'] + df['b']
46 | return df
47 | ```
48 |
49 | This schema hint comment is read by Fugue to make the `add_transformer` an extension during runtime. In fact, the schema hint is not even required if the schema is provided during runtime as seen below.
50 |
51 | ```python
52 | from fugue import transform
53 |
54 | def add_transformer(df:pd.DataFrame) -> pd.DataFrame:
55 | df['c'] = df['a'] + df['b']
56 | return df
57 |
58 | df = transform(df, add_transformer, schema="*, c:int")
59 | ```
60 |
61 | These approaches that leave the code in native Python are called the [interfaceless](./interfaceless.ipynb) approach and are the easiest way to use Fugue. They are designed to be non-invasive and, at the same time, encourage more maintainable code through type annotations and schema comments. The resulting code is not tied to Fugue and can run independently.
62 |
63 | ## Output Schema Requirement
64 |
65 | There is a distinction when it comes to `driver`-side and `worker`-side extensions. The driver-side extensions (Creator, Processor, Outputter) have access to the schema, so there is no need to infer or guess it. This is why Fugue does not require the schema to be specified if the output annotation is one of `LocalDataFrame`, `DataFrame`, or `pd.DataFrame` for the `Creator`, `Processor`, and `Outputter`.
66 |
67 | For the `worker`-side extensions, things need to be a bit more explicit. Distributed computing frameworks can normally infer output schema; however, the inference is neither reliable nor efficient. To infer the schema, the framework has to go through at least one partition of data and figure out a possible schema, but `transformers` can produce inconsistent schemas on different partitions, and the inference can take a long time or simply fail. So, to avoid potential correctness and performance issues, `Transformer` and `CoTransformer` output schemas are required in Fugue.
68 |
69 | ## Summary
70 |
71 | | . | Creator | Processor | Outputter | Transformer | CoTransformer | OutputTransformer | OutputCoTransformer
72 | |---|---|---|---|---|---|---|---
73 | |Input | 0 | 1+ | 1+ | 1 | 1+ | 1 | 1+
74 | |Output| 1 | 1 | 0 | 1 | 1 | 0 | 0
75 | |Side |Driver|Driver | Driver | Worker | Worker | Worker | Worker
76 | |Engine Aware | Yes | Yes | Yes | No | No | No | No
77 |
78 |
79 | ```{toctree}
80 | :hidden:
81 |
82 | creator
83 | processor
84 | outputter
85 | transformer
86 | cotransformer
87 | outputtransformer
88 | outputcotransformer
89 | interfaceless
90 | ```
91 |
--------------------------------------------------------------------------------
/tutorials/extensions/interfaceless.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Interfaceless\n",
8 | "\n",
9 | "Fugue does have interfaces for all these extensions. But in most of the cases, you don't have to implement these interfaces, instead, Fugue will adapt to your functions and wrap them under these interfaces.\n",
10 | "\n",
11 | "One obvious benefit is that most of your code can be totally independent from Fugue. It's not like other computing frameworks, you have to implement their interfaces and use their data structures, and the dependency will spread in your code, it will be hard to move away from them, and testing is also difficult and slow.\n",
12 | "\n",
13 | "Actually a more important benefit is that, it helps you separate your logics. What can be independent from the computing framework? What has to depend on that? How to minimize the dependency? When you use Fugue, you naturally think more about these design questions. And with the interfaceless feature, you will be able to achieve them elegantly. Even someday you decide to move away from Fugue, you should find it's simple and the Fugue mindset will still be beneficial when you move into another framework.\n",
14 | "\n",
15 | "To be interfaceless, you must have type annotations for your python code so Fugue can understand what you want. Writing python code with type hints are in general regarded as good practice. If you are not familiar, read [PEP 483](https://www.python.org/dev/peps/pep-0483/), [PEP 484](https://www.python.org/dev/peps/pep-0484/), [typing module](https://docs.python.org/3/library/typing.html) and [mypy](http://mypy-lang.org/).\n",
16 | "\n",
17 | "Take [**Transformer**](./transformer.ipynb) as an example:"
18 | ]
19 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# absolutely no dependency on Fugue\n",
32 | "from typing import List, Dict, Iterable, Any\n",
33 | "import pandas as pd\n",
34 | "\n",
35 | "def transformer1(df:List[List[Any]], a, b) -> List[List[Any]]:\n",
36 | " pass\n",
37 | "\n",
38 | "def transformer2(df:Iterable[Dict[str, Any]], a, b) -> pd.DataFrame:\n",
39 | " pass\n",
40 | "\n",
41 | "# schema: *,b:int\n",
42 | "def transformer3(df:pd.DataFrame, a, b) -> Iterable[Dict[str, Any]]:\n",
43 | " pass"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "`transformer1` wants to process a partition of the dataframe in the format of list of list (row in the format of list), also return a list of list. `transformer2` will process a partition in the format of iterable of dict (row in the format of dictionary), and return a pandas dataframe. `transformer3` will be pd.DataFrame in and iterable out, which is a great design pattern for distributed computing (because it can minimize the memory usage).\n",
51 | "\n",
52 | "Fugue is able to wrap these 3 functions to Fugue Transformers, and at runtime, the data will be passed in according to the type annotations. Also notice the comment line before `transformer3`, it is to tell Fugue the output schema of the transformation. For `transformer1` and `transformer2` you will need to specify the schema in the Fugue code.\n",
53 | "\n",
54 | "These 3 transformers can achieve the same thing, but with the flexibility of input and output, you may write much more intuitive and less tedious code and let Fugue handle the rest.\n",
55 | "\n",
56 | "Also with flexibile input and output, Fugue is able to optimize the execution. For example, with iterable input, Fugue will not preload the entire partition in memory, and you can exit the iteration any time. And with pd.DataFrame as input and output, you will get best performance when using [SparkExecutionEngine with pandas_udf](../advanced/useful_config.ipynb#Use-Pandas-UDF-on-SparkExecutionEngine) enabled, because pandas_udf itself requires pd.DataFrame as input and output, so your annotation eliminates data conversion.\n",
57 | "\n",
58 | "Parameters are not required to have type annotations, but it's good practice to have annotations for all parameters.\n",
59 | "\n",
60 | "It is fine to use class member functions as extensions.\n",
61 | "\n",
62 | "* **Why is this useful?** You can initialize the class with certain parameters, and they can be used inside these transformers\n",
63 | "* **What can be used:**\n",
64 | " * native function with comment (schema hint)\n",
65 | " * native function without comment (schema hint)\n",
66 | "* **What can't be used:**\n",
67 | " * functions with decorator\n",
68 | "* **What you need to be careful about?**\n",
69 | " * it's a bad idea to modify the class member variables inside a member function, for certain extensions and execution engine, it may not work\n",
70 | " * all member variables of the class are better to be simple native data types that are picklable because for certain engine such as Spark, this is required"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "import pandas as pd\n",
80 | "from fugue import FugueWorkflow\n",
81 | "\n",
82 | "class Test(object):\n",
83 | " def __init__(self, n):\n",
84 | " self.n = n\n",
85 | " \n",
86 | " # schema: *\n",
87 | " def transform1(self, df:pd.DataFrame) -> pd.DataFrame:\n",
88 | " df[\"a\"]+=self.n\n",
89 | " return df\n",
90 | " \n",
91 | " # schema: *\n",
92 | " def transform2(self, df:pd.DataFrame) -> pd.DataFrame:\n",
93 | " df[\"a\"]*=self.n\n",
94 | " return df\n",
95 | " \n",
96 | "test = Test(5)\n",
97 | "with FugueWorkflow() as dag:\n",
98 | " dag.df([[2]],\"a:int\").transform(test.transform1).show()\n",
99 | " dag.df([[2]],\"a:int\").transform(test.transform2).show()"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "## Different Ways to Write Extensions\n",
107 | "\n",
108 | "Here is a general comparison of the different ways to write Fugue extensions:\n",
109 | "\n",
110 | "|   | Native | With Comment | With Decorator | Interface |\n",
111 | "| --- | ---|---|---| ---|\n",
112 | "|Interfaceless | Yes | Yes | Yes | No |\n",
113 | "|Independent from Fugue | Yes | Yes | No | No |\n",
114 | "|Performance | Good | Good | Good | Slightly better |\n",
115 | "|Function as extension | Yes | Yes | Yes | No (has to be class) |\n",
116 | "|Fugue can use it without providing schema | Depends | Yes | Yes | Yes |\n",
117 | "|Flexibility on constructing schema | N/A | OK | Good | Best |\n",
118 | "|Can use class member functions | Yes | Yes | No | N/A |"
119 | ]
120 | }
121 | ],
122 | "metadata": {
123 | "interpreter": {
124 | "hash": "4cd7ab41f5fca4b9b44701077e38c5ffd31fe66a6cab21e0214b68d958d0e462"
125 | },
126 | "kernelspec": {
127 | "display_name": "Python 3",
128 | "language": "python",
129 | "name": "python3"
130 | },
131 | "language_info": {
132 | "name": "python",
133 | "version": "3.7.9"
134 | }
135 | },
136 | "nbformat": 4,
137 | "nbformat_minor": 4
138 | }
--------------------------------------------------------------------------------
/tutorials/extensions/outputcotransformer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "# Output CoTransformer (Advanced)\n",
7 | "\n",
8 | "`OutputCoTransformer` is similar to `CoTransformer`, and any `CoTransformer` can be used as an `OutputCoTransformer`. It is important to understand the difference between the operations `transform` and `out_transform`.\n",
9 | "\n",
10 | "Remember that the syntax to use a `CoTransformer` is to `zip` the dataframes first and then apply the transformation; an `OutputCoTransformer` follows the same pattern, except that `out_transform` is called instead of `transform`.\n",
11 | "\n",
12 | "* `transform` is lazy, meaning that Fugue does not ensure the compute happens immediately. For example, if using `SparkExecutionEngine`, the real compute of `transform` happens only when hitting an action, such as `print` or `save`.\n",
13 | "* `out_transform` is an action; Fugue ensures the compute happens immediately, regardless of what execution engine is used.\n",
14 | "* `transform` outputs a transformed dataframe for the following steps to use\n",
15 | "* `out_transform` is the last compute of a branch in the DAG; it outputs nothing.\n",
16 | "\n",
17 | "You may find that `transform().persist()` can be an alternative to `out_transform`. That is generally fine, but notice that the output dataframe of a transformation can be very large; if you persist or checkpoint it, it can take up a great portion of memory or disk space. In contrast, `out_transform` does not take any space. Plus, it is a more explicit way to show what you want to do.\n",
18 | "\n",
19 | "This tutorial covers the methods to define an `OutputCoTransformer`. There is no preferred method and Fugue makes it flexible for users to choose whatever interface works for them. The three ways are the native approach, the decorator, and the class interface, in order of simplicity. Note that schema hints do not work.\n",
20 | "\n",
21 | "A typical use case is to compare two dataframes per partition in a distributed way."
22 | ],
23 | "metadata": {}
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "source": [
28 | "## Native Approach\n",
29 | "An `OutputCoTransformer` normally returns nothing, so the default schema is `None`. Because of this, it will work if no schema is specified. The `OutputCoTransformer` is not meant to mutate schema so it will not respect any schema hint. "
30 | ],
31 | "metadata": {}
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 1,
36 | "source": [
37 | "from typing import List, Any\n",
38 | "\n",
39 | "def assert_eq(df1:List[List[Any]], df2:List[List[Any]]) -> None:\n",
40 | " assert df1 == df2\n",
41 | " print(df1,\"==\",df2)\n",
42 | "\n",
43 | "def assert_eq_2(df1:List[List[Any]], df2:List[List[Any]]) -> List[List[Any]]:\n",
44 | " assert df1 == df2\n",
45 | " print(df1,\"==\",df2)\n",
46 | " return [[0]]"
47 | ],
48 | "outputs": [],
49 | "metadata": {}
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "source": [
55 | "from fugue import FugueWorkflow\n",
56 | "\n",
57 | "with FugueWorkflow() as dag:\n",
58 | " df1 = dag.df([[0,1],[0,2],[1,3]], \"a:int,b:int\")\n",
59 | " df2 = dag.df([[1,3],[0,2],[0,1]], \"a:int,b:int\")\n",
60 | " z = df1.zip(df2, partition=dict(by=[\"a\"],presort=[\"b\"]))\n",
61 | " z.out_transform(assert_eq)\n",
62 | " z.out_transform(assert_eq_2) # All CoTransformer like functions/classes can be used directly"
63 | ],
64 | "outputs": [
65 | {
66 | "output_type": "stream",
67 | "name": "stdout",
68 | "text": [
69 | "[[0, 1], [0, 2]] == [[0, 1], [0, 2]]\n",
70 | "[[1, 3]] == [[1, 3]]\n",
71 | "[[0, 1], [0, 2]] == [[0, 1], [0, 2]]\n",
72 | "[[1, 3]] == [[1, 3]]\n"
73 | ]
74 | }
75 | ],
76 | "metadata": {}
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "source": [
81 | "## Decorator Approach\n",
82 | "\n",
83 | "There is no obvious advantage to using the decorator approach for `OutputCoTransformer`."
84 | ],
85 | "metadata": {}
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 3,
90 | "source": [
91 | "from fugue.extensions import output_cotransformer\n",
92 | "\n",
93 | "@output_cotransformer()\n",
94 | "def assert_eq(df1:List[List[Any]], df2:List[List[Any]]) -> None:\n",
95 | " assert df1 == df2\n",
96 | " print(df1,\"==\",df2)\n",
97 | " \n",
98 | "with FugueWorkflow() as dag:\n",
99 | " df1 = dag.df([[0,1],[0,2],[1,3]], \"a:int,b:int\")\n",
100 | " df2 = dag.df([[1,3],[0,2],[0,1]], \"a:int,b:int\")\n",
101 | " z = df1.zip(df2, partition=dict(by=[\"a\"],presort=[\"b\"]))\n",
102 | " z.out_transform(assert_eq)"
103 | ],
104 | "outputs": [
105 | {
106 | "output_type": "stream",
107 | "name": "stdout",
108 | "text": [
109 | "[[0, 1], [0, 2]] == [[0, 1], [0, 2]]\n",
110 | "[[1, 3]] == [[1, 3]]\n"
111 | ]
112 | }
113 | ],
114 | "metadata": {}
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "source": [
119 | "## Interface Approach\n",
120 | "\n",
121 | "Just like the interface approach of `CoTransformer`, you get all the flexibility and control over your transformation."
122 | ],
123 | "metadata": {}
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 4,
128 | "source": [
129 | "from fugue.extensions import OutputCoTransformer\n",
130 | "\n",
131 | "class AssertEQ(OutputCoTransformer):\n",
132 | " # notice the interface is different from CoTransformer\n",
133 | " def process(self, dfs):\n",
134 | " df1, df2 = dfs[0].as_array(), dfs[1].as_array()\n",
135 | " assert df1 == df2\n",
136 | " print(df1,\"==\",df2)\n",
137 | "\n",
138 | "with FugueWorkflow() as dag:\n",
139 | " df1 = dag.df([[0,1],[0,2],[1,3]], \"a:int,b:int\")\n",
140 | " df2 = dag.df([[1,3],[0,2],[0,1]], \"a:int,b:int\")\n",
141 | " z = df1.zip(df2, partition=dict(by=[\"a\"],presort=[\"b\"]))\n",
142 | " z.out_transform(AssertEQ)"
143 | ],
144 | "outputs": [
145 | {
146 | "output_type": "stream",
147 | "name": "stdout",
148 | "text": [
149 | "[[0, 1], [0, 2]] == [[0, 1], [0, 2]]\n",
150 | "[[1, 3]] == [[1, 3]]\n"
151 | ]
152 | }
153 | ],
154 | "metadata": {}
155 | }
156 | ],
157 | "metadata": {
158 | "kernelspec": {
159 | "name": "python3",
160 | "display_name": "Python 3.7.9 64-bit"
161 | },
162 | "language_info": {
163 | "codemirror_mode": {
164 | "name": "ipython",
165 | "version": 3
166 | },
167 | "file_extension": ".py",
168 | "mimetype": "text/x-python",
169 | "name": "python",
170 | "nbconvert_exporter": "python",
171 | "pygments_lexer": "ipython3",
172 | "version": "3.7.9"
173 | },
174 | "interpreter": {
175 | "hash": "4cd7ab41f5fca4b9b44701077e38c5ffd31fe66a6cab21e0214b68d958d0e462"
176 | }
177 | },
178 | "nbformat": 4,
179 | "nbformat_minor": 4
180 | }
--------------------------------------------------------------------------------
/tutorials/extensions/outputtransformer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "# Output Transformer (Advanced)\n",
7 | "\n",
8 | "`OutputTransformer` is in general similar to `Transformer`, and any `Transformer` can be used as an `OutputTransformer`. It is important to understand the difference between the operations `transform` and `out_transform`.\n",
9 | "\n",
10 | "* `transform` is lazy; Fugue does not ensure the compute happens immediately. For example, if using `SparkExecutionEngine`, the real compute of `transform` happens only when hitting an action, for example `save`.\n",
11 | "* `out_transform` is an action; Fugue ensures the compute happens immediately, regardless of what execution engine is used.\n",
12 | "* `transform` outputs a transformed dataframe for the following steps to use\n",
13 | "* `out_transform` is the last compute of a branch in the DAG; it outputs nothing.\n",
14 | "\n",
15 | "You may find that `transform().persist()` can be an alternative to `out_transform`. That is generally fine, but notice that the output dataframe of a transformation can be very large; if you persist or checkpoint it, it can take up a great portion of memory or disk space. In contrast, `out_transform` does not take any space. Plus, it is a more explicit way to show what you want to do.\n",
16 | "\n",
17 | "A typical use case of `out_transform` is to save the dataframe in a custom way, for example, pushing it to Redis.\n",
18 | "\n",
19 | "This tutorial covers the methods to define an `OutputTransformer`. There is no preferred method and Fugue makes it flexible for users to choose whatever interface works for them. The three ways are the native approach, the decorator, and the class interface, in order of simplicity. Note that schema hints do not work."
20 | ],
21 | "metadata": {}
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "source": [
26 | "## Native Approach\n",
27 | "\n",
28 | "An `OutputTransformer` normally returns nothing, so the default schema is `None`. Because of this, it will work if no schema is specified. The `OutputTransformer` is not meant to mutate schema so it will not respect any schema hint. "
29 | ],
30 | "metadata": {}
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 1,
35 | "source": [
36 | "from typing import Iterable, Dict, Any, List\n",
37 | "from fugue import FugueWorkflow\n",
38 | "\n",
39 | "def push_to_redis(df:Iterable[Dict[str,Any]]) -> Iterable[Dict[str,Any]]:\n",
40 | " for row in df:\n",
41 | " print(\"pushing1\", row)\n",
42 | " return df\n",
43 | "\n",
44 | "with FugueWorkflow() as dag:\n",
45 | " df = dag.df([[0,1],[0,2],[1,3],[1,1]],\"a:int,b:int\")\n",
46 | " df.out_transform(push_to_redis)"
47 | ],
48 | "outputs": [
49 | {
50 | "output_type": "stream",
51 | "name": "stdout",
52 | "text": [
53 | "pushing1 {'a': 0, 'b': 1}\n",
54 | "pushing1 {'a': 0, 'b': 2}\n",
55 | "pushing1 {'a': 1, 'b': 3}\n",
56 | "pushing1 {'a': 1, 'b': 1}\n"
57 | ]
58 | }
59 | ],
60 | "metadata": {}
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "source": [
65 | "## Decorator Approach\n",
66 | "\n",
67 | "There is no obvious advantage to using the decorator approach for `OutputTransformer`."
68 | ],
69 | "metadata": {}
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 2,
74 | "source": [
75 | "from fugue.extensions import output_transformer\n",
76 | "\n",
77 | "@output_transformer()\n",
78 | "def push_to_redis(df:Iterable[Dict[str,Any]]) -> None:\n",
79 | " for row in df:\n",
80 | " print(\"pushing2\", row)\n",
81 | " continue\n",
82 | " \n",
83 | "with FugueWorkflow() as dag:\n",
84 | " df = dag.df([[0,1],[0,2],[1,3],[1,1]],\"a:int,b:int\")\n",
85 | " df.partition(by=[\"a\"], presort=\"b\").out_transform(push_to_redis)"
86 | ],
87 | "outputs": [
88 | {
89 | "output_type": "stream",
90 | "name": "stdout",
91 | "text": [
92 | "pushing2 {'a': 0, 'b': 1}\n",
93 | "pushing2 {'a': 0, 'b': 2}\n",
94 | "pushing2 {'a': 1, 'b': 1}\n",
95 | "pushing2 {'a': 1, 'b': 3}\n"
96 | ]
97 | }
98 | ],
99 | "metadata": {}
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "source": [
104 | "## Interface Approach (Advanced)\n",
105 | "\n",
106 | "Just like the interface approach of `Transformer`, you get all the flexibility and control over your transformation."
107 | ],
108 | "metadata": {}
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 3,
113 | "source": [
114 | "from fugue.extensions import OutputTransformer\n",
115 | " \n",
116 | "class Push(OutputTransformer):\n",
117 | " # Notice OutputTransformer has different interface than Transformer\n",
118 | " def process(self, df):\n",
119 | " print(\"pushing2\", self.cursor.key_value_dict)\n",
120 | " \n",
121 | "with FugueWorkflow() as dag:\n",
122 | " df = dag.df([[0,1],[0,2],[1,3],[1,1]],\"a:int,b:int\")\n",
123 | " df.partition(by=[\"a\"], presort=\"b\").out_transform(Push)"
124 | ],
125 | "outputs": [
126 | {
127 | "output_type": "stream",
128 | "name": "stdout",
129 | "text": [
130 | "pushing2 {'a': 0}\n",
131 | "pushing2 {'a': 1}\n"
132 | ]
133 | }
134 | ],
135 | "metadata": {}
136 | }
137 | ],
138 | "metadata": {
139 | "kernelspec": {
140 | "name": "python3",
141 | "display_name": "Python 3.7.9 64-bit"
142 | },
143 | "language_info": {
144 | "codemirror_mode": {
145 | "name": "ipython",
146 | "version": 3
147 | },
148 | "file_extension": ".py",
149 | "mimetype": "text/x-python",
150 | "name": "python",
151 | "nbconvert_exporter": "python",
152 | "pygments_lexer": "ipython3",
153 | "version": "3.7.9"
154 | },
155 | "orig_nbformat": 2,
156 | "interpreter": {
157 | "hash": "4cd7ab41f5fca4b9b44701077e38c5ffd31fe66a6cab21e0214b68d958d0e462"
158 | }
159 | },
160 | "nbformat": 4,
161 | "nbformat_minor": 2
162 | }
--------------------------------------------------------------------------------
/tutorials/fugue_sql/index.md:
--------------------------------------------------------------------------------
1 | # FugueSQL
2 |
3 | All questions are welcome in the Slack channel.
4 |
5 |
6 | [](https://mybinder.org/v2/gh/fugue-project/tutorials/master) ⬅️ Launch these tutorials in Binder
7 |
8 | [](https://github.com/fugue-project/fugue) ⬅️ Check out our source code
9 |
10 | [](http://slack.fugue.ai) ⬅️ Chat with us on slack
11 |
12 | `FugueSQL` is designed for heavy SQL users to extend the boundaries of traditional SQL workflows. `FugueSQL` allows the expression of logic for end-to-end distributed computing workflows. It can also be combined with Python code to use custom functions alongside the SQL commands. It provides a unified interface, allowing the same SQL code to run on Pandas, Dask, and Spark.
13 |
14 | The SQL code is parsed with [ANTLR](https://www.antlr.org/) and mapped to the equivalent functions in the `Fugue` programming interface.
15 |
16 | ```{toctree}
17 | :hidden:
18 |
19 | syntax
20 | operators
21 | python
22 | extensions
23 | builtin
24 | ```
25 |
26 | FugueSQL has a [10 minute tutorial here](../quick_look/ten_minutes_sql.ipynb). This page is a more in-depth look at FugueSQL.
27 |
28 | ## 1. Installation
29 |
30 | In order to use `FugueSQL`, you first need to make sure you have installed the `sql` extra
31 | ```
32 | pip install fugue[sql]
33 | ```
34 | To run on Spark or Dask execution engines, install the appropriate extras. Alternatively, `all` can be used as an extra.
35 | ```
36 | pip install fugue[sql,spark]
37 | pip install fugue[sql,dask]
38 | pip install fugue[all]
39 | ```
40 |
41 | FugueSQL has a notebook extension for both Jupyter Notebooks and JupyterLab. This extension provides syntax highlighting and registers the `%%fsql` cell magic. To install the extension, use pip:
42 |
43 | ```
44 | pip install fugue-jupyter
45 | ```
46 |
47 | and then to register the startup script:
48 |
49 | ```
50 | fugue-jupyter install startup
51 | ```
52 |
53 | ## [2. FugueSQL Syntax](syntax.ipynb)
54 |
55 | Get started with `FugueSQL`. This shows input and output of data, enhancements over standard SQL, and how to use SQL to describe computation logic. After this, users will be able to use `FugueSQL` with the familiar SQL keywords to perform operations on top of **Pandas**, **Spark**, and **Dask**.
56 |
57 | ## [3. Additional SQL Operators](operators.ipynb)
58 |
59 | Go over the implemented operations that `Fugue` has on top of the ones provided by standard SQL. `FugueSQL` is extensible with Python code, but the most common functions are added as built-ins. These include filling NULL values, dropping NULL values, renaming columns, changing schema, etc. This section goes over the most used additional keywords.
60 |
61 | ## [4. Integrating Python](python.ipynb)
62 |
63 | Explore [Jinja templating](https://jinja.palletsprojects.com/) for variable passing, and using Python functions as [Transformers](../extensions/transformer.ipynb) in a `%%fsql` cell.
64 |
65 | ## [5. Using Custom Fugue Extensions](extensions.ipynb)
66 |
67 | The [Transformer](../extensions/transformer.ipynb) is just one of many possible [Fugue extensions](../extensions/index.md). In this section we'll explore the syntax of all the other Fugue extensions: [Creator](../extensions/creator.ipynb), [Processor](../extensions/processor.ipynb), [Outputter](../extensions/outputter.ipynb), and [CoTransformer](../extensions/cotransformer.ipynb).
68 |
69 | ## [6. Using Built-in Extensions](builtin.ipynb)
70 |
71 | Commonly used extensions are also provided as built-in extensions. These are also a good way to contribute to Fugue to enhance the FugueSQL experience.
72 |
73 | ## 7. FugueSQL with Pandas
74 |
75 | `%%fsql` uses the NativeExecutionEngine by default. This engine runs on Pandas. All of the SQL operations have equivalents in Pandas, but the behavior can sometimes be inconsistent. For example, Pandas will drop NULL values by default in a groupby operation. The NativeExecutionEngine was designed to mostly make operations consistent with Spark and SQL.
76 |
77 | ## 8. FugueSQL with Spark
78 |
79 | `FugueSQL` also works on **Spark** by passing in the execution engine. This looks like `%%fsql spark`. The operations are mapped to **Spark** and SparkSQL operations. The difference is that `FugueSQL` adds syntax on top of SparkSQL, as seen in the [syntax tutorial](syntax.ipynb). Additionally, with `FugueSQL` the same code will execute on Pandas and Dask without modification. This allows for quick testing without having to spin up a cluster. Users prototype with the `NativeExecutionEngine`, and then move to a **Spark** cluster by changing the execution engine.
80 |
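As a minimal sketch (assuming the `fugue-jupyter` extension is installed and a Pandas DataFrame named `df` with made-up columns is already defined in the notebook), the same cell can be moved from Pandas to Spark by only changing the engine on the first line:

```
%%fsql spark
SELECT id, SUM(num) AS total
FROM df
GROUP BY id
PRINT
```

Dropping `spark` from the `%%fsql` line runs the identical query on the default Pandas-based `NativeExecutionEngine`.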
--------------------------------------------------------------------------------
/tutorials/integrations/backends/dask_sql.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fugue with Dask-sql\n",
8 | "\n",
9 | "**Pandas** and **Spark** already have solutions that allow users to execute SQL code to describe computation workflows. **Dask**, on the other hand, does not have a standard SQL interface yet. `FugueSQL` provides this feature with the `DaskExecutionEngine`, but users should also be aware of [dask-sql](https://dask-sql.readthedocs.io/en/latest/index.html), a relatively new project that already has a majority of SQL keywords implemented. It is also faster than FugueSQL on average. However, there are still some features under development. Most notably, the SQL `WINDOW` is not yet implemented.\n",
10 | "\n",
11 | "We are collaborating to have our solutions converge and create the de facto SQL interface for Dask. In the meantime, we have unified our solutions by allowing `FugueSQL` to use [dask-sql](https://dask-sql.readthedocs.io/en/latest/index.html) as an [execution engine](../advanced/execution_engine.ipynb). The [dask_sql](https://github.com/nils-braun/dask-sql) project has added a `DaskSQLExecutionEngine` into their code that we can import and pass into `FugueSQLWorkflow`. Note that this is a different engine from Fugue's `DaskExecutionEngine`.\n",
12 | "\n",
13 | "`FugueSQLWorkflow` usage is nearly identical to the `fsql` function we saw previously. The main difference is that it takes in a SQL engine as seen in the example below."
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## Sample Usage\n",
21 | "\n",
22 | "The example below shows that when parts of the SQL query cannot be executed in `dask-sql`, `FugueSQL` is used instead. We are able to use the `TAKE` and `PRINT` keywords even though they don't exist in `dask-sql`. We can also use `TRANSFORM` and `PREPARTITION` even though these are `Fugue` keywords.\n",
23 | "\n",
24 | "`FugueSQL` and `dask-sql` together can provide a more powerful solution. This allows us to use both solutions to get the best of both worlds in terms of speed and operation completeness. All we need to do is pass the `DaskSQLExecutionEngine` into `FugueSQLWorkflow`.\n",
25 | "\n",
26 | "NOTE: In order for the code below to run, `dask-sql` needs to be installed.\n"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "from dask_sql.integrations.fugue import DaskSQLExecutionEngine\n",
36 | "from fugue_sql import FugueSQLWorkflow\n",
37 | "import pandas as pd\n",
38 | "\n",
39 | "data = [\n",
40 | " [\"A\", \"2020-01-01\", 10],\n",
41 | " [\"A\", \"2020-01-02\", 20],\n",
42 | " [\"A\", \"2020-01-03\", 30],\n",
43 | " [\"B\", \"2020-01-01\", 20],\n",
44 | " [\"B\", \"2020-01-02\", 30],\n",
45 | " [\"B\", \"2020-01-03\", 40]\n",
46 | "]\n",
47 | "schema = \"id:str,date:date,value:int\"\n",
48 | "\n",
49 | "# schema: *, cumsum:int\n",
50 | "def cumsum(df: pd.DataFrame) -> pd.DataFrame:\n",
51 | " df[\"cumsum\"] = df['value'].cumsum()\n",
52 | " return df\n",
53 | "\n",
54 | "# Run the DAG on the DaskSQLExecutionEngine by dask-sql\n",
55 | "with FugueSQLWorkflow(DaskSQLExecutionEngine) as dag:\n",
56 | " df = dag.df(data, schema)\n",
57 | " dag(\"\"\"\n",
58 | " SELECT *\n",
59 | " FROM df\n",
60 | " TRANSFORM PREPARTITION BY id PRESORT date ASC USING cumsum\n",
61 | " TAKE 5 ROWS\n",
62 | " PRINT\n",
63 | " \"\"\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "When SQL keywords don't exist in `dask-sql`, execution defaults to Fugue's `DaskExecutionEngine`. However, when a keyword is registered by `dask-sql`, its implementation is used. `OVER PARTITION` is registered but still being developed, which can cause errors. One workaround is to use Fugue's `TRANSFORM` and `PREPARTITION` as shown above to avoid `OVER PARTITION` for now."
71 | ]
72 | }
73 | ],
74 | "metadata": {
75 | "kernelspec": {
76 | "display_name": "Python 3.8.9 64-bit",
77 | "language": "python",
78 | "name": "python3"
79 | },
80 | "language_info": {
81 | "name": "python",
82 | "version": "3.8.9"
83 | },
84 | "orig_nbformat": 2,
85 | "vscode": {
86 | "interpreter": {
87 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
88 | }
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 2
93 | }
94 |
--------------------------------------------------------------------------------
/tutorials/integrations/backends/index.md:
--------------------------------------------------------------------------------
1 | # Backends
2 |
3 | Here, we look at the backends supported by Fugue. Backends are the execution engines that Fugue runs code on top of. It is also common to mix and match execution engines. For example, big data processing can happen on SparkSQL and then DuckDB can be used on the smaller processed subset of data.
4 |
5 | Have questions? Chat with us on Github or Slack:
6 |
7 | [](https://github.com/fugue-project/fugue)
8 | [](http://slack.fugue.ai)
9 |
10 | ```{toctree}
11 | :hidden:
12 |
13 | ibis
14 | polars
15 | dask_sql
16 | duckdb
17 | ```
18 |
19 | ## Python Backends
20 |
21 | **This is in addition to Spark, Dask, and Ray**
22 |
23 | ## [Ibis](ibis.ipynb)
24 | [Ibis](https://github.com/ibis-project/ibis) is a Python framework to write analytical workloads on top of data warehouses (along with DataFrames). Ibis can be used in conjunction with Fugue to query from data warehouses.
25 |
26 | ## [Polars](polars.ipynb)
27 | [Polars](https://github.com/pola-rs/polars) is a DataFrame library written in Rust (with a Python API) that supports multi-threaded and out-of-core operations. Polars already parallelizes operations well on a local machine. Fugue's integration is focused on allowing Polars code to run on top of a cluster with Spark, Dask, or Ray. There are certain use cases where this will increase the performance of distributed applications.
28 |
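A rough sketch of this pattern (assuming the Polars and Dask extras of Fugue are installed; `add_double` and the data are made up for illustration): a function written purely with Polars can be distributed by `transform()` just by choosing an engine.

```python
import polars as pl
from fugue import transform

# Polars logic for one partition: Polars DataFrame in, Polars DataFrame out
def add_double(df: pl.DataFrame) -> pl.DataFrame:
    return df.with_columns((pl.col("num") * 2).alias("num2"))

result = transform(
    pl.DataFrame({"num": [1, 2, 3]}),
    add_double,
    schema="*,num2:long",
    engine="dask",  # assumption: "spark" or "ray" would be used the same way
)
```
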
29 | ## SQL Backends
30 |
31 | **This is in addition to SparkSQL**
32 |
33 | ## [Dask SQL](dask_sql.ipynb)
34 | [Dask-sql](https://github.com/dask-contrib/dask-sql) is a Dask project that provides a SQL interface on top of Dask DataFrames (including Dask on GPU). FugueSQL can use the Dask-SQL backend to run Dask-SQL and Dask code together.
35 |
36 | ## [DuckDB](duckdb.ipynb)
37 | [DuckDB](https://duckdb.org/) is an in-process SQL OLAP database management system. It is similar to SQLite but optimized for analytical workloads. DuckDB performs optimizations of queries, allowing it to be 10x - 100x more performant than Pandas in some cases. Good use cases are testing locally, and then moving to SparkSQL when running on big data, or using DuckDB to query initial data before working with local Pandas for more complicated transformations.
38 |
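As a minimal sketch of the DuckDB-to-SparkSQL pattern described above (the DataFrame `df` and its columns are made up for illustration), the same FugueSQL query can be tested locally on DuckDB and later run on SparkSQL by only changing the engine:

```python
import pandas as pd
from fugue_sql import fsql

df = pd.DataFrame({"id": ["a", "a", "b"], "num": [1, 2, 3]})

query = """
SELECT id, SUM(num) AS total FROM df GROUP BY id
PRINT
"""

fsql(query, df=df).run("duckdb")   # quick local testing on DuckDB
# fsql(query, df=df).run("spark")  # the same query on SparkSQL for big data
```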
--------------------------------------------------------------------------------
/tutorials/integrations/cloudproviders/anyscale.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8e680c15",
6 | "metadata": {},
7 | "source": [
8 | "# Anyscale\n",
9 | "\n",
10 | "Fugue works perfectly well with [Anyscale](https://www.anyscale.com/). This document assumes you already have an Anyscale account set up and you know the basic operations on Anyscale. You can sign up for Anyscale [here](https://www.anyscale.com/signup).\n",
11 | "\n",
12 | "## Create Fugue environment\n",
13 | "\n",
14 | "You must create a new compute environment on Anyscale where you:\n",
15 | "\n",
16 | "1. Install `fugue[ray]`\n",
17 | "2. Make sure `ray>=2.0.0`\n",
18 | "3. Use Python 3.8+ (recommended)\n",
19 | "\n",
20 | "\n",
21 | "\n",
22 | "Besides that, if you want to use s3, please install [fs-s3fs](https://pypi.org/project/fs-s3fs/),\n",
23 | "if you want to use gcs, please install [fs-gsfs](https://pypi.org/project/fs-gcsfs/).\n",
24 | "\n",
25 | "\n",
26 | "## Start from the jupyter notebook inside Anyscale\n",
27 | "\n",
28 | "The easiest way to start trying is to start a cluster inside Anyscale, and then use the jupyter notebook\n",
29 | "\n",
30 | "\n",
31 | "\n",
32 | "Using this approach, you only need to use the standard Ray execution engine, for example:\n",
33 | "\n",
34 | "```python\n",
35 | "transform(df, func, engine=\"ray\")\n",
36 | "```\n",
37 | "\n",
38 | "## Start from your local environment\n",
39 | "\n",
40 | "Firstly, please pip install on your local environment:\n",
41 | "\n",
42 | "```bash\n",
43 | "pip install fugue-cloudprovider[anyscale]\n",
44 | "```\n",
45 | "\n",
46 | "You need to get your token in Anyscale:\n",
47 | "\n",
48 | "\n",
49 | "\n",
50 | "You can choose to use the command to login:\n",
51 | "\n",
52 | "```bash\n",
53 | "anyscale auth set\n",
54 | "```\n",
55 | "\n",
56 | "Or, you can add `token` to your engine config (more examples later):\n",
57 | "\n",
58 | "```json\n",
59 | "{\n",
60 | " \"token\": \"...\"\n",
61 | "}\n",
62 | "```\n",
63 | "\n",
64 | "Putting an explicit token in the engine config is not good practice. But if your config is stored in\n",
65 | "a secret store, it can be both safe and convenient.\n",
66 | "\n",
67 | "## Engine configs\n",
68 | "\n",
69 | "### Config items:\n",
70 | "\n",
71 | "> **token**\n",
72 | "\n",
73 | "The Anyscale token for authentication; it is required if you have not logged in on your machine.\n",
74 | "\n",
75 | "> **address**\n",
76 | "\n",
77 | "The `anyscale://...` address representing a predefined cluster:\n",
78 | "\n",
79 | "\n",
80 | "\n",
81 | "**One and only one of `address` and `cluster` must be specified**\n",
82 | "\n",
83 | "> **cluster**\n",
84 | "\n",
85 | "The parameter to [launch a cluster](https://docs.anyscale.com/reference/python-sdk/api#launch_cluster). You can use\n",
86 | "this option to construct a new cluster on the fly.\n",
87 | "\n",
88 | "> **ephemeral** (default: False)\n",
89 | "\n",
90 | "Whether to terminate this Anyscale cluster when the Fugue execution engine is shut down.\n",
91 | "\n",
92 | "> **fugue.ray.shuffle.partitions**\n",
93 | "\n",
94 | "Default number of shuffle partitions for groupby and transform operations. If not set, it will be determined by the number of\n",
95 | "partitions of the input dataframe.\n",
96 | "\n",
97 | "**Notice** that for Anyscale, if this config is not set and the max number of CPUs (`cpu_n`) of the cluster can be computed,\n",
98 | "then this config will be automatically set to `cpu_n * 2`.\n",
99 | "\n",
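"A small, self-contained sketch (the `double` function and the data are made up; the cluster address follows the placeholder style of the example below) of setting this config through `engine_conf`:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"from fugue import transform\n",
"\n",
"def double(df:pd.DataFrame) -> pd.DataFrame:\n",
"    return df.assign(b=df[\"a\"] * 2)\n",
"\n",
"transform(\n",
"    pd.DataFrame(dict(a=[1, 2, 3])),\n",
"    double,\n",
"    schema=\"*,b:int\",\n",
"    engine=\"anyscale://project/cluster-1\",\n",
"    engine_conf={\"fugue.ray.shuffle.partitions\": 200},\n",
")\n",
"```\n",
"\n",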
100 | "> **fugue.ray.remote.***\n",
101 | "\n",
102 | "Default [ray remote](https://docs.ray.io/en/latest/ray-core/package-ref.html#ray-remote) arguments used by Fugue\n",
103 | "`RayExecutionEngine`. For example:\n",
104 | "\n",
105 | "```python\n",
106 | "transform(\n",
107 | " ...,\n",
108 | " engine=\"anyscale://project/cluster-1\",\n",
109 | " engine_conf={\n",
110 | " \"fugue.ray.remote.num_cpus\":2\n",
111 | " }\n",
112 | ")\n",
113 | "```\n",
114 | "\n",
115 | "Each transformation task will take 2 cpus. If you have a Ray cluster of 100 cpus, then the max parallelism will be 50.\n",
116 | "\n",
117 | "\n",
118 | "### Config combinations:\n",
119 | "\n",
120 | "> `engine=\"anyscale\", engine_conf={...}`\n",
121 | "\n",
122 | "This is the most standard way, for example:\n",
123 | "\n",
124 | "```python\n",
125 | "transform(..., engine=\"anyscale\", engine_conf={\"token\":\"...\", \"cluster\":{...}, \"ephemeral\":True})\n",
126 | "```\n",
127 | "\n",
128 | "> `engine=\"anyscale://...\", engine_conf={...}`\n",
129 | "\n",
130 | "This is equivalent to `engine=\"anyscale\", engine_conf={\"address\":\"anyscale://...\", ...}`.\n",
131 | "\n",
132 | "The simplest example would be (assuming you logged in Anyscale on the machine):\n",
133 | "\n",
134 | "```python\n",
135 | "transform(..., engine=\"anyscale://project/cluster-1\")\n",
136 | "```\n",
137 | "\n",
138 | "It will connect to the remote predefined cluster (if the cluster is not started, it will launch\n",
139 | "the cluster first and then connect).\n",
140 | "\n",
141 | "> **The programmatic approach**\n",
142 | "\n",
143 | "In `fugue-cloudprovider` we provide a utility class `Cluster` to connect to or create remote Anyscale clusters.\n",
144 | "All engine configs will also work for the `Cluster` class:\n",
145 | "\n",
146 | "```python\n",
147 | "from fugue_anyscale import Cluster\n",
148 | "\n",
149 | "with Cluster({\"address\":\"anyscale://project/cluster-1\", \"ephemeral\":True}) as cluster:\n",
150 | " transform(..., engine=cluster)\n",
151 | "```"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "id": "a15185ea",
157 | "metadata": {},
158 | "source": []
159 | }
160 | ],
161 | "metadata": {
162 | "kernelspec": {
163 | "display_name": "Python 3.8.13 64-bit",
164 | "language": "python",
165 | "name": "python3"
166 | },
167 | "language_info": {
168 | "codemirror_mode": {
169 | "name": "ipython",
170 | "version": 3
171 | },
172 | "file_extension": ".py",
173 | "mimetype": "text/x-python",
174 | "name": "python",
175 | "nbconvert_exporter": "python",
176 | "pygments_lexer": "ipython3",
177 | "version": "3.8.13"
178 | },
179 | "vscode": {
180 | "interpreter": {
181 | "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
182 | }
183 | }
184 | },
185 | "nbformat": 4,
186 | "nbformat_minor": 5
187 | }
188 |
--------------------------------------------------------------------------------
/tutorials/integrations/cloudproviders/coiled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8e680c15",
6 | "metadata": {},
7 | "source": [
8 | "# Coiled\n",
9 | "\n",
10 | "Fugue works perfectly well with [Coiled](https://coiled.io/). This document assumes you already have the Coiled service set up and you know the basic operations on Coiled. If that is not the case, please create an account on Coiled (with free credits) and set up AWS or GCP according to their instructions.\n",
11 | "\n",
12 | "## Setup the environment\n",
13 | "\n",
14 | "### Start from the pre-built Docker image\n",
15 | "\n",
16 | "The easiest way to start trying is to run the docker environment locally:\n",
17 | "\n",
18 | "```bash\n",
19 | "docker run -p 8888:8888 -it fugueproject/coiled:latest jupyter lab --port=8888 --ip=0.0.0.0 --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*'\n",
20 | "```\n",
21 | "\n",
22 | "You can access the JupyterLab environment at `http://localhost:8888` in your browser. When you are in JupyterLab, the first thing is to log in to Coiled. You may start a terminal in the lab environment and type\n",
23 | "\n",
24 | "```bash\n",
25 | "coiled login\n",
26 | "```\n",
27 | "\n",
28 | "Or just follow the official [instruction](https://docs.coiled.io/user_guide/configuration.html) to setup your environment.\n",
29 | "\n",
30 | "Now, you can start a new notebook. The first step is to create a software environment in Coiled. For example, you can run the following code:\n",
31 | "\n",
32 | "```python\n",
33 | "import os\n",
34 | "import coiled\n",
35 | "\n",
36 | "SOFTWARE_ENV = os.environ[\"DOCKER_IMAGE\"].replace(\"/\",\"_\").replace(\":\",\"_\").replace(\".\",\"_\")\n",
37 | "coiled.create_software_environment(name=SOFTWARE_ENV, container=os.environ[\"DOCKER_IMAGE\"])\n",
38 | "```\n",
39 | "\n",
40 | "The most important thing is that the container should be the same as the one you are running on, which ensures version consistency.\n",
41 | "\n",
42 | "### Start from scratch\n",
43 | "\n",
44 | "To start from scratch, you must make sure the coiled cloud provider dependency is installed:\n",
45 | "\n",
46 | "```bash\n",
47 | "pip install fugue-cloudprovider[coiled]\n",
48 | "```\n",
49 | "\n",
50 | "Besides that, you also need to make sure your worker environment has matching packages and Python version. **You must also install fugue on the worker side.**\n",
51 | "\n",
52 | "## Use Fugue on Coiled\n",
53 | "\n",
54 | "### If you already have a Coiled cluster\n",
55 | "\n",
56 | "You may follow the [official doc](https://docs.coiled.io/user_guide/cluster.html) or your own way to create a `Cluster` instance, assuming it is `cluster`. Then `cluster` can be used as the execution engine directly, for example:"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "id": "b5013f22",
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "from fugue_sql import fsql\n",
67 | "from fugue import transform\n",
68 | "import pandas as pd\n",
69 | "\n",
70 | "fsql(\"\"\"\n",
71 | "CREATE [[0]] SCHEMA a:int\n",
72 | "PRINT\n",
73 | "\"\"\").run(cluster)\n",
74 | "\n",
75 | "def my_transformer(df:pd.DataFrame) -> pd.DataFrame:\n",
76 | " return df\n",
77 | "\n",
78 | "transform(\n",
79 | " pd.DataFrame(dict(a=[0,1])),\n",
80 | " my_transformer,\n",
81 | " schema=\"*\",\n",
82 | " engine=cluster\n",
83 | ")"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "id": "89ec3e02",
89 | "metadata": {},
90 | "source": [
91 | "In this case, Fugue is not responsible for starting or closing the Dask cluster; you must take care of it yourself.\n",
92 | "\n",
93 | "### If you already have a Dask Client\n",
94 | "\n",
95 | "When you have already instantiated a Dask client, the `client` instance can be used as the execution engine. Or you can just use the string `dask` as the execution engine; it finds the active client automatically.\n",
96 | "\n",
97 | "The following two ways are both fine:"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "id": "4cfd9ed2",
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "fsql(\"\"\"\n",
108 | "CREATE [[0]] SCHEMA a:int\n",
109 | "PRINT\n",
110 | "\"\"\").run(client)\n",
111 | "\n",
112 | "fsql(\"\"\"\n",
113 | "CREATE [[0]] SCHEMA a:int\n",
114 | "PRINT\n",
115 | "\"\"\").run(\"dask\")\n",
116 | "\n",
117 | "transform(\n",
118 | " pd.DataFrame(dict(a=[0,1])),\n",
119 | " my_transformer,\n",
120 | " schema=\"*\",\n",
121 | " engine=\"dask\"\n",
122 | ")"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "id": "34fa38cb",
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "%%fsql dask\n",
133 | "CREATE [[0]] SCHEMA a:int\n",
134 | "PRINT"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "id": "1fd148af",
140 | "metadata": {},
141 | "source": [
142 | "Again, you will be responsible for starting and stopping the Dask client; Fugue will just use it.\n",
143 | "\n",
144 | "People may forget to close the cluster, which is a waste of money. So we provide a slightly better way to help you manage the resource."
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "id": "410a8323",
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "from fugue_coiled import CoiledDaskClient\n",
155 | "\n",
156 | "with CoiledDaskClient(**coiled_cluster_kwargs) as client:\n",
157 | " fsql(\"\"\"\n",
158 | " CREATE [['abc']] SCHEMA a:str\n",
159 | " SELECT * WHERE a LIKE 'ab%'\n",
160 | " PRINT\n",
161 | " \"\"\").run(client)"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "id": "633a7427",
167 | "metadata": {},
168 | "source": [
169 | "In this case, `CoiledDaskClient` will automatically close the Coiled cluster and the Dask client at the end of the context.\n",
170 | "\n",
171 | "### If you want to connect to a running Coiled cluster\n",
172 | "\n",
173 | "Just set the engine to `coiled:<cluster_name>`, for example `coiled:my_cluster` as below. Make sure the cluster with that name is active. Fugue will not stop this cluster after execution."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "id": "a19ab5ff",
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "fsql(\"\"\"\n",
184 | "CREATE [[0]] SCHEMA a:int\n",
185 | "PRINT\n",
186 | "\"\"\").run(\"coiled:my_cluster\")\n",
187 | "\n",
188 | "transform(\n",
189 | " pd.DataFrame(dict(a=[0,1])),\n",
190 | " my_transformer,\n",
191 | " schema=\"*\",\n",
192 | " engine=\"coiled:my_cluster\",\n",
193 | ")"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "id": "f224275f",
199 | "metadata": {},
200 | "source": [
201 | "### If you want an ephemeral Coiled cluster\n",
202 | "\n",
203 | "If you haven't instantiated a Coiled cluster or a Dask client, and you only want to use the computing resource at a certain step, we have a way to use Coiled as an ephemeral service.\n",
204 | "\n",
205 | "You just need to use the string `coiled` as the execution engine, and provide the configs in the following format:\n",
206 | "\n",
207 | "| Config Name | Description |\n",
208 | "| --- | --- |\n",
209 | "| token | The Coiled token for authentication; it is required if you have not logged in on your machine |\n",
210 | "| cluster | A dictionary of parameters to instantiate [coiled.Cluster](https://docs.coiled.io/user_guide/api.html#coiled.Cluster). For example `dict(n_workers=2, software=\"my_env\")` |"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 1,
216 | "id": "c812a9c9",
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "CONF = dict(token=\"abc\", cluster=dict(n_workers=2, software=\"my_env\"))"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "id": "1c97f1d0",
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "fsql(\"\"\"\n",
231 | "CREATE [[0]] SCHEMA a:int\n",
232 | "PRINT\n",
233 | "\"\"\").run(\"coiled\", CONF)\n",
234 | "\n",
235 | "transform(\n",
236 | " pd.DataFrame(dict(a=[0,1])),\n",
237 | " my_transformer,\n",
238 | " schema=\"*\",\n",
239 | " engine=\"coiled\",\n",
240 | " engine_conf=CONF\n",
241 | ")"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "id": "f6121961",
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "%%fsql coiled CONF\n",
252 | "CREATE [[0]] SCHEMA a:int\n",
253 | "PRINT"
254 | ]
255 | }
256 | ],
257 | "metadata": {
258 | "kernelspec": {
259 | "display_name": "Python 3.8.13 64-bit",
260 | "language": "python",
261 | "name": "python3"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.8.13"
274 | },
275 | "vscode": {
276 | "interpreter": {
277 | "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
278 | }
279 | }
280 | },
281 | "nbformat": 4,
282 | "nbformat_minor": 5
283 | }
284 |
--------------------------------------------------------------------------------
/tutorials/integrations/cloudproviders/images/anyscale_address.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/tutorials/integrations/cloudproviders/images/anyscale_address.png
--------------------------------------------------------------------------------
/tutorials/integrations/cloudproviders/images/anyscale_auth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/tutorials/integrations/cloudproviders/images/anyscale_auth.png
--------------------------------------------------------------------------------
/tutorials/integrations/cloudproviders/images/anyscale_env.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/tutorials/integrations/cloudproviders/images/anyscale_env.png
--------------------------------------------------------------------------------
/tutorials/integrations/cloudproviders/images/anyscale_jupyter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/tutorials/505d81959e96a4df021952f56a1e4bcf767cc967/tutorials/integrations/cloudproviders/images/anyscale_jupyter.png
--------------------------------------------------------------------------------
/tutorials/integrations/cloudproviders/index.md:
--------------------------------------------------------------------------------
1 | # Cloud Providers
2 |
3 | Since [Fugue](https://github.com/fugue-project/fugue) is a framework for distributed compute, it is often paired with a solution that manages Spark, Dask, or Ray clusters. This section will cover using Fugue on top of cloud providers such as Databricks or Coiled. Fugue has a [`fugue-cloudprovider`](https://github.com/fugue-project/fugue-cloudprovider) package that allows users to easily spin up ephemeral compute for their compute workflows.
4 |
5 | Have questions? Chat with us on Github or Slack:
6 |
7 | [](https://github.com/fugue-project/fugue)
8 | [](http://slack.fugue.ai)
9 |
10 | 
11 |
12 | ```{toctree}
13 | :hidden:
14 |
15 | databricks
16 | coiled
17 | anyscale
18 | ```
19 |
20 | ## Spark
21 |
22 | **[Databricks](databricks.ipynb)**
23 |
24 | [Databricks](https://www.databricks.com/) is the most common provider for Spark clusters. Using the `databricks-connect` library, we can easily spin up an ephemeral Spark cluster. We can connect to the SparkSession on the Databricks cluster from a local machine.
25 |
26 | ## Dask
27 |
28 | **[Coiled](coiled.ipynb)**
29 |
30 | [Coiled](https://coiled.io/) is the easiest way to host Dask clusters on the cloud. Using the [coiled](https://pypi.org/project/coiled/) library, we can easily spin up an ephemeral Dask cluster or connect to an existing Dask cluster on Coiled.
31 |
32 | ## Ray
33 |
34 | **[Anyscale](anyscale.ipynb)**
35 |
36 | [Anyscale](https://www.anyscale.com/) is the Ray platform on the cloud. Using the [anyscale](https://pypi.org/project/anyscale/) library, we can easily spin up an ephemeral Ray cluster or connect to an existing Ray cluster on Anyscale.
--------------------------------------------------------------------------------
/tutorials/integrations/ecosystem/datacompy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# DataComPy\n",
8 | "\n",
9 | "Have questions? Chat with us on Github or Slack:\n",
10 | "\n",
11 | "[](https://github.com/fugue-project/fugue)\n",
12 | "[](http://slack.fugue.ai)\n",
13 | "\n",
14 | "[DataComPy](https://github.com/capitalone/datacompy) is an open-source package by Capital One that started as a way to compare two Pandas DataFrames with more functionality than just `pandas.DataFrame.equals(pandas.DataFrame)`. It allows users to specify tolerances and prints out statistics.\n",
15 | "\n",
16 | "Fugue is now an internal dependency of DataComPy, which extends the functionality to the backends Fugue supports (Spark, Dask, Ray, Polars, DuckDB, Arrow, etc.). A common use case is comparing a Pandas DataFrame with a distributed DataFrame (Spark, Dask or Ray)."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## Basic Usage\n",
24 | "\n",
25 | "The most scalable way to use DataComPy is the `is_match` method. An example can be found below. The DataFrames are joined on the `acct_id` column and then compared. There are other supported operations not covered here. For more details, check the [DataComPy documentation](https://capitalone.github.io/datacompy/#things-that-are-happening-behind-the-scenes)."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "data": {
35 | "text/plain": [
36 | "False"
37 | ]
38 | },
39 | "execution_count": 4,
40 | "metadata": {},
41 | "output_type": "execute_result"
42 | }
43 | ],
44 | "source": [
45 | "from io import StringIO\n",
46 | "import pandas as pd\n",
47 | "import datacompy\n",
48 | "\n",
49 | "data1 = \"\"\"acct_id,dollar_amt,name,float_fld,date_fld\n",
50 | "10000001234,123.45,George Maharis,14530.1555,2017-01-01\n",
51 | "10000001235,0.45,Michael Bluth,1,2017-01-01\n",
52 | "10000001236,1345,George Bluth,,2017-01-01\n",
53 | "10000001237,123456,Bob Loblaw,345.12,2017-01-01\n",
54 | "10000001239,1.05,Lucille Bluth,,2017-01-01\n",
55 | "\"\"\"\n",
56 | "\n",
57 | "data2 = \"\"\"acct_id,dollar_amt,name,float_fld\n",
58 | "10000001234,123.4,George Michael Bluth,14530.155\n",
59 | "10000001235,0.45,Michael Bluth,\n",
60 | "10000001236,1345,George Bluth,1\n",
61 | "10000001237,123456,Robert Loblaw,345.12\n",
62 | "10000001238,1.05,Loose Seal Bluth,111\n",
63 | "\"\"\"\n",
64 | "\n",
65 | "df1 = pd.read_csv(StringIO(data1))\n",
66 | "df2 = pd.read_csv(StringIO(data2))\n",
67 | "\n",
68 | "datacompy.is_match(\n",
69 | " df1,\n",
70 | " df2,\n",
71 | " join_columns='acct_id', #You can also specify a list of columns\n",
72 | " abs_tol=0, #Optional, defaults to 0\n",
73 | " rel_tol=0, #Optional, defaults to 0\n",
74 | " df1_name='Original', #Optional, defaults to 'df1'\n",
75 | " df2_name='New' #Optional, defaults to 'df2'\n",
76 | ")\n",
77 | "# False"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "## Report Generation\n",
85 | "\n",
86 | "For a full report, use the `report` function. The report is truncated in this notebook because the output is long."
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 5,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "name": "stdout",
96 | "output_type": "stream",
97 | "text": [
98 | "DataComPy Comparison\n",
99 | "--------------------\n",
100 | "\n",
101 | "DataFrame Summary\n",
102 | "-----------------\n",
103 | "\n",
104 | " DataFrame Columns Rows\n",
105 | "0 Original 5 5\n",
106 | "1 New 4 5\n",
107 | "\n",
108 | "Column Summary\n",
109 | "--------------\n",
110 | "\n",
111 | "Number of columns in common: 4\n",
112 | "Number of columns in Original but not in New: 1\n",
113 | "Number of columns in New but not in Original: 0\n",
114 | "\n",
115 | "Row Summary\n",
116 | "-----------\n",
117 | "\n",
118 | "Matched on: acct_id\n",
119 | "Any duplicates on match values: No\n",
120 | "Absolute Tolerance: 0\n",
121 | "Relative Tolerance: 0\n",
122 | "Number of rows in common: 4\n",
123 | "Number of rows in Original but not in New: 1\n",
124 | "Number of rows in New but not in Original: 1\n",
125 | "\n",
126 | "Number of rows with some compared columns unequal: 4\n",
127 | "Number of rows with all compared columns equal: 0\n",
128 | "\n",
129 | "Column Comparison\n",
130 | "-----------------\n",
131 | "\n",
132 | "Number of columns compared with some values unequal: 3\n",
133 | "Number of columns compared with all values equal: 1\n",
134 | "Total number of values which compare unequal: 6\n",
135 | "\n",
136 | "Columns with Unequal Values or Types\n",
137 | "------------------------------------\n",
138 | "\n",
139 | " Column Original dtype New dtype # Unequal Max Diff # Null Diff\n",
140 | "0 dollar_amt float64 float64 1 0.0500 0\n",
141 | "2 float_fld float64 float64 3 0.0005 2\n",
142 | "1 name object object 2 0.0000 0\n",
143 | "\n",
144 | "Sample Rows with Unequal Values\n",
145 | "-------------------------------\n",
146 | "\n",
147 | " acct_id dollar_amt (Original) dollar_amt (New)\n",
148 | "0 10000001234 123.45 123.4\n",
149 | "\n",
150 | " acct_id name (Original) name (New)\n",
151 | "0 10000001237 Bob Loblaw Robert Loblaw\n",
152 | "1 10000001234 George Maharis George Michael Bluth\n",
153 | "\n",
154 | " acct_id float_fld (Original) float_fld (New)\n",
155 | "0 10000001234 14530.1555 14530.155\n",
156 | "1 10000001236 NaN 1.000\n",
157 | "2 10000001235 1.0000 NaN\n",
158 | "\n",
159 | "Sample Rows Only in Original (First 10 Columns)\n",
160 | "-----------------------------------------------\n",
161 | "\n",
162 | " acct_id dollar_amt name float_fld date_fld\n",
163 | "0 10000001239 1.05 Lucille Bluth NaN 2017-01-01\n",
164 | "\n",
165 | "Sample Rows Only in New (First 10 Columns)\n",
166 | "------------------------------------------\n",
167 | "\n",
168 | " acct_id dollar_amt name float_fld\n",
169 | "0 10000001238 1.05 Loose Seal Bluth 111.0\n",
170 | "\n",
171 | "\n"
172 | ]
173 | }
174 | ],
175 | "source": [
176 | "# This method prints out a human-readable report summarizing and sampling differences\n",
177 | "print(datacompy.report(\n",
178 | " df1,\n",
179 | " df2,\n",
180 | " join_columns='acct_id', #You can also specify a list of columns\n",
181 | " abs_tol=0, #Optional, defaults to 0\n",
182 | " rel_tol=0, #Optional, defaults to 0\n",
183 | " df1_name='Original', #Optional, defaults to 'df1'\n",
184 | " df2_name='New' #Optional, defaults to 'df2'\n",
185 | "))"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "## Distributed Usage\n",
193 | "\n",
194 | "To compare DataFrames across different backends, just pass in the DataFrame objects directly, such as Pandas DataFrames, DuckDB relations, Polars DataFrames, Arrow tables, Spark DataFrames, Dask DataFrames or Ray Datasets. For example, to compare a Pandas DataFrame with a Spark DataFrame:"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "from pyspark.sql import SparkSession\n",
204 | "\n",
205 | "spark = SparkSession.builder.getOrCreate()\n",
206 | "spark_df2 = spark.createDataFrame(df2)\n",
207 | "datacompy.is_match(\n",
208 | " df1,\n",
209 | " spark_df2,\n",
210 | " join_columns='acct_id',\n",
211 | ")"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "Notice that in order to use a specific backend, you need to have the corresponding library installed. For example, if you want to compare Ray datasets, you must do:\n",
219 | "\n",
220 | "```\n",
221 | "pip install datacompy[ray]\n",
222 | "```"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "## Conclusion\n",
230 | "\n",
231 | "Here we introduced the DataComPy library and showed how to compare DataFrames across Spark, Dask, Ray, DuckDB, PyArrow, and Polars through Fugue."
232 | ]
233 | }
234 | ],
235 | "metadata": {
236 | "kernelspec": {
237 | "display_name": "Python 3.8.13 ('fugue')",
238 | "language": "python",
239 | "name": "python3"
240 | },
241 | "language_info": {
242 | "codemirror_mode": {
243 | "name": "ipython",
244 | "version": 3
245 | },
246 | "file_extension": ".py",
247 | "mimetype": "text/x-python",
248 | "name": "python",
249 | "nbconvert_exporter": "python",
250 | "pygments_lexer": "ipython3",
251 | "version": "3.8.13"
252 | },
253 | "orig_nbformat": 2,
254 | "vscode": {
255 | "interpreter": {
256 | "hash": "9fcd6e71927f6b3e5f4fa4280b4e8e6a66aa8d4365bb61cf7ef4017620fc09b9"
257 | }
258 | }
259 | },
260 | "nbformat": 4,
261 | "nbformat_minor": 2
262 | }
263 |
--------------------------------------------------------------------------------
/tutorials/integrations/ecosystem/index.md:
--------------------------------------------------------------------------------
1 | # Ecosystem
2 |
3 | [Fugue](https://github.com/fugue-project/fugue) can be used in conjunction with a lot of other Python libraries. Some of these integrations are native where Fugue can be used as a backend. For the others, there is no native integration but they can be used together with minimal lines of code, normally through the `transform()` function.
4 |
5 | Have questions? Chat with us on Github or Slack:
6 |
7 | [](https://github.com/fugue-project/fugue)
8 | [](http://slack.fugue.ai)
9 |
10 | ```{toctree}
11 | :hidden:
12 |
13 | pandera
14 | whylogs
15 | nixtla
16 | pycaret
17 | datacompy
18 | prefect
19 | ```
20 |
21 | ## Data Validation
22 |
23 | **[Whylogs](whylogs.ipynb)**
24 |
25 | [Whylogs](https://github.com/whylabs/whylogs) is a data logging library that scalably profiles data for use cases like data validation or anomaly detection.
26 |
27 | **[Pandera](pandera.ipynb)**
28 |
29 | [Pandera](https://pandera.readthedocs.io/en/stable/) is a lightweight data validation framework originally designed to provide a minimal interface for validating Pandas DataFrames. Pandera has been extended to Spark and Dask through Koalas and Modin, but it can also be used pretty seamlessly with Fugue. Fugue also supports validation by partition.
30 |
31 | **[Datacompy](datacompy.ipynb)**
32 |
33 | [Datacompy](https://github.com/capitalone/datacompy) is a package to compare two DataFrames of any type. It originally allowed users to compare two Pandas DataFrames or two Spark DataFrames. By adding Fugue as a backend, it can now compare all DataFrames Fugue supports (Pandas, DuckDB, Polars, Arrow, Spark, Dask, Ray, and more).
34 |
35 | ## Machine Learning
36 |
37 | **[PyCaret](pycaret.ipynb)**
38 |
39 | [PyCaret](https://pycaret.readthedocs.io/en/stable/) is an open-source low-code machine learning library that allows users to train dozens of models in a few lines of code. With a native integration, Fugue users can distribute the machine learning training over Spark, Dask or Ray.
40 |
41 | **[Nixtla](nixtla.ipynb)**
42 |
43 | [Nixtla](https://github.com/Nixtla/nixtla) is a project focused on state-of-the-art time series modeling. The current Fugue integration is around their statistical forecasting package named [statsforecast](https://github.com/Nixtla/statsforecast). Fugue lets users apply `AutoARIMA` and `ETS` models to forecast millions of independent time series on top of distributed compute.
44 |
45 | ## Orchestration
46 |
47 | **[Prefect](prefect.ipynb)**
48 |
49 | [Prefect](https://www.prefect.io/) is an open-source workflow orchestration framework used for scheduling and monitoring tasks. The `prefect-fugue` collection allows users to iterate locally, and then bring the code to Databricks or Coiled for execution when it is production-ready.
50 |
--------------------------------------------------------------------------------
/tutorials/integrations/warehouses/index.md:
--------------------------------------------------------------------------------
1 | # Warehouses
2 |
3 | Here, we look at the data warehouses and query engines that work with Fugue, such as BigQuery and Trino.
4 |
5 | Have questions? Chat with us on Github or Slack:
6 |
7 | [](https://github.com/fugue-project/fugue)
8 | [](http://slack.fugue.ai)
9 |
10 | ```{toctree}
11 | :hidden:
12 |
13 | bigquery.ipynb
14 | trino.ipynb
15 | ```
16 |
17 | ## [BigQuery](bigquery.ipynb)
18 | Google BigQuery is Google Cloud's enterprise data warehouse offering. It is serverless and cost-effective.
19 |
20 | ## [Trino](trino.ipynb)
21 | Trino (formerly PrestoSQL) is an open-source distributed SQL query engine that lets users query across different data sources. For example,
22 | users can query Google BigQuery and combine the results with data on Google Cloud Storage.
23 |
--------------------------------------------------------------------------------
/tutorials/quick_look/index.md:
--------------------------------------------------------------------------------
1 | # Quick Look
2 |
3 | The [Fugue](https://github.com/fugue-project/fugue) project aims to make distributed computing effortless. It ports Python, [Pandas](https://pandas.pydata.org/docs/), and SQL code to [Spark](https://spark.apache.org/docs/latest/api/python/), [Dask](https://docs.dask.org/en/stable/), [Ray](https://docs.ray.io/en/latest/index.html), and [DuckDB](https://duckdb.org/docs/). Through Fugue, users only have to worry about defining their logic in the most intuitive way. Production-ready code can then be scaled out to a distributed computing backend just by adding a few lines of code.
4 |
5 | This section contains 10-minute introductions to Fugue and FugueSQL.
6 |
7 | Have questions? Chat with us on Github or Slack:
8 |
9 | [](https://github.com/fugue-project/fugue)
10 | [](http://slack.fugue.ai)
11 |
12 |
13 | ```{toctree}
14 | :hidden:
15 |
16 | ten_minutes
17 | ten_minutes_sql
18 | ```
19 |
20 | ## [Fugue API in 10 Minutes](ten_minutes.ipynb)
21 | Learn the basic Python interface of Fugue by starting with the `transform()` function. This function takes existing Python and Pandas code and brings it to Spark, Dask, or Ray with minimal rewrites. The `transform()` function alone already allows users to write framework-agnostic code with all of its features. It's incrementally adoptable, and users can use it for as little as a single step in their Spark, Dask, or Ray pipelines. For end-to-end framework-agnostic workflows, we then take a quick look at the functions of the Fugue API.
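
As a minimal sketch of the idea (the data and function below are illustrative):

```python
import pandas as pd
from fugue import transform

df = pd.DataFrame({"col": [1, 2, 3]})

def add_one(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(col_plus_one=df["col"] + 1)

# Runs on Pandas by default; passing engine="spark", "dask", or "ray"
# (with the corresponding extra installed) runs the same function distributed.
result = transform(df, add_one, schema="*,col_plus_one:long")
```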
22 |
23 | ## [FugueSQL in 10 Minutes](ten_minutes_sql.ipynb)
24 | For users who prefer SQL over Python, Fugue also has a first-class SQL interface to use on top of Pandas, Spark, and Dask DataFrames. FugueSQL is an enhanced version of SQL with added keywords and syntax intended for end-to-end computing workflows. FugueSQL is parsed and then run on the specified backend. For example, FugueSQL using Spark will run on SparkSQL and PySpark.
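
A tiny sketch of what this looks like (the exact import path can vary slightly between Fugue versions):

```python
import pandas as pd
from fugue_sql import fsql

df = pd.DataFrame({"a": [1, 2, 3]})

# Runs on the default local engine; .run("spark") would execute the same
# query through SparkSQL/PySpark instead.
fsql("""
SELECT a, a * 2 AS b FROM df
PRINT
""", df=df).run()
```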
25 |
--------------------------------------------------------------------------------
/tutorials/resources/appendix/generate_types.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "language_info": {
4 | "codemirror_mode": {
5 | "name": "ipython",
6 | "version": 3
7 | },
8 | "file_extension": ".py",
9 | "mimetype": "text/x-python",
10 | "name": "python",
11 | "nbconvert_exporter": "python",
12 | "pygments_lexer": "ipython3",
13 | "version": "3.8.0"
14 | },
15 | "orig_nbformat": 2,
16 | "kernelspec": {
17 | "name": "python3",
18 | "display_name": "Python 3.8.0 64-bit ('fugue-tutorials': conda)",
19 | "metadata": {
20 | "interpreter": {
21 | "hash": "131b24c7e1bb8763ab2b04d5b6d98a68c7b3a823a2a57c5722935f7690890f70"
22 | }
23 | }
24 | }
25 | },
26 | "nbformat": 4,
27 | "nbformat_minor": 2,
28 | "cells": [
29 | {
30 | "source": [
31 | "# Fugue and PyArrow Types\n",
32 | "\n",
33 |     "We can use FugueSQL to generate the Fugue -> PyArrow type mapping table."
34 | ],
35 | "cell_type": "markdown",
36 | "metadata": {}
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 1,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "output_type": "stream",
45 | "name": "stdout",
46 | "text": [
47 | "PandasDataFrame\nis_primary:str|fugue_type_expr:str|pa_type:str \n--------------+-------------------+-----------------------------------------------------------------\nYES |bytes |binary \n. |binary |binary \nYES |bool |bool \n. |boolean |bool \nYES |date |date32[day] \nYES |double |double \n. |float64 |double \nYES |float |float \n. |float32 |float \nYES |float16 |halffloat \nYES |short |int16 \n. |int16 |int16 \nYES |int |int32 \n. |int32 |int32 \nYES |long |int64 \n. |int64 |int64 \nYES |byte |int8 \n. |int8 |int8 \nYES |null |null \nYES |str |string \n. |string |string \nYES |datetime |timestamp[us] \nYES |ushort |uint16 \n. |uint16 |uint16 \nYES |uint |uint32 \n. |uint32 |uint32 \nYES |ulong |uint64 \n. |uint64 |uint64 \nYES |ubyte |uint8 \n. |uint8 |uint8 \nTotal count: 30\n\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "import triad\n",
53 | "from fugue_sql import FugueSQLWorkflow\n",
54 | "from typing import List, Any\n",
55 | "\n",
56 | "#schema: fugue_type_expr:str, pa_type:str\n",
57 | "def type_to_expr(primary:bool=False) -> List[List[Any]]:\n",
58 | " if not primary:\n",
59 | " return [[k,str(v)] for k,v in triad.utils.pyarrow._TYPE_EXPRESSION_MAPPING.items()]\n",
60 | " else:\n",
61 | " return [[v,str(k)] for k,v in triad.utils.pyarrow._TYPE_EXPRESSION_R_MAPPING.items()]\n",
62 | " \n",
63 | "with FugueSQLWorkflow() as dag:\n",
64 | " dag(\"\"\"\n",
65 | " f2p = CREATE USING type_to_expr\n",
66 | " f2p_primary = CREATE USING type_to_expr(primary=true)\n",
67 | " SELECT CASE WHEN f2p_primary.pa_type IS NOT NULL THEN \"YES\" ELSE \".\" END AS is_primary,f2p.*\n",
68 | " FROM f2p LEFT OUTER JOIN f2p_primary \n",
69 | " ON f2p.fugue_type_expr=f2p_primary.fugue_type_expr\n",
70 | " ORDER BY pa_type, is_primary DESC\n",
71 | " \n",
72 | " PRINT 100 ROWS\n",
73 | " \"\"\")"
74 | ]
75 | }
76 | ]
77 | }
--------------------------------------------------------------------------------
/tutorials/resources/appendix/index.md:
--------------------------------------------------------------------------------
1 | # Appendix
2 |
3 | The appendix is for miscellaneous content. Any questions are welcome in the Slack channel.
4 |
5 | Have questions? Chat with us on Github or Slack:
6 |
7 | [](https://github.com/fugue-project/fugue)
8 | [](http://slack.fugue.ai)
9 |
10 |
11 | ```{toctree}
12 | :hidden:
13 |
14 | generate_types.ipynb
15 | ```
16 |
17 | ## [Fugue and PyArrow Types](generate_types.ipynb)
18 | Comparison of Fugue and PyArrow types.
19 |
--------------------------------------------------------------------------------
/tutorials/resources/best_practices/explicit_schema.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Explicit Schema\n",
8 | "\n",
9 | "**COMING SOON**\n",
10 | "\n",
11 | "## Pandas and Mixed Type Columns\n",
12 | "\n",
13 |     "One of the bad habits that Pandas enables is having mixed-type columns that are simply labelled as type `object`. This is not allowed in distributed computing frameworks such as Spark, Dask, and Ray because the data can be spread across multiple machines, and having explicit data types guarantees consistency of the operations performed across them."
14 | ]
15 | }
16 | ],
17 | "metadata": {
18 | "kernelspec": {
19 | "display_name": "Python 3.8.9 64-bit",
20 | "language": "python",
21 | "name": "python3"
22 | },
23 | "language_info": {
24 | "name": "python",
25 | "version": "3.8.9"
26 | },
27 | "orig_nbformat": 4,
28 | "vscode": {
29 | "interpreter": {
30 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
31 | }
32 | }
33 | },
34 | "nbformat": 4,
35 | "nbformat_minor": 2
36 | }
37 |
--------------------------------------------------------------------------------
/tutorials/resources/best_practices/file_formats.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# File Formats\n",
8 | "\n",
9 |     "Pandas practitioners often use CSVs to ingest data for processing. Although this is common practice, it's not good practice. In this section, we'll explain why parquet files are more often used when dealing with distributed computing frameworks such as Spark, Dask, and Ray. Even if the size of the data is still small, there are benefits to using parquet."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## CSVs Don't Hold Schema Information\n",
17 | "\n",
18 | "The first major downside is that CSVs do not hold schema information. Pandas users often just rely on schema inference during loading. This is why many Pandas users have to convert types after loading in data. For example, boolean columns might be represented as integer columns, which would occupy significantly more memory.\n",
19 | "\n",
20 |     "Another common case is that columns are loaded as strings, needing conversion before any processing is done.\n",
21 |     "\n",
22 |     "To get past this, Pandas users often write a function to change all the types after loading the data. Sometimes, this conversion code is duplicated across multiple files.\n",
23 | "\n",
24 |     "**On the other hand, parquet holds schema information, making it easy to share data across a team. It also eliminates the need for schema inference.** A rough sketch of this round-trip difference is shown in the code cell below."
25 | ]
26 | },
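  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A rough sketch of the round-trip difference described above.\n",
    "# The column names and file paths here are illustrative only.\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.DataFrame({\n",
    "    'when': pd.to_datetime(['2021-01-01', '2021-01-02']),\n",
    "    'count': pd.array([1, None], dtype='Int64'),\n",
    "})\n",
    "\n",
    "df.to_csv('demo.csv', index=False)\n",
    "print(pd.read_csv('demo.csv').dtypes)         # when -> object, count -> float64\n",
    "\n",
    "df.to_parquet('demo.parquet')\n",
    "print(pd.read_parquet('demo.parquet').dtypes)  # schema preserved\n"
   ]
  },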
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Compression\n",
32 | "\n",
33 | "The compression is significantly better on parquet. Parquet files tend to be around one-fifth the size of CSV files."
34 | ]
35 |   }
43 | ],
44 | "metadata": {
45 | "kernelspec": {
46 | "display_name": "Python 3.8.9 64-bit",
47 | "language": "python",
48 | "name": "python3"
49 | },
50 | "language_info": {
51 | "codemirror_mode": {
52 | "name": "ipython",
53 | "version": 3
54 | },
55 | "file_extension": ".py",
56 | "mimetype": "text/x-python",
57 | "name": "python",
58 | "nbconvert_exporter": "python",
59 | "pygments_lexer": "ipython3",
60 | "version": "3.8.9"
61 | },
62 | "orig_nbformat": 4,
63 | "vscode": {
64 | "interpreter": {
65 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
66 | }
67 | }
68 | },
69 | "nbformat": 4,
70 | "nbformat_minor": 2
71 | }
72 |
--------------------------------------------------------------------------------
/tutorials/resources/best_practices/index.md:
--------------------------------------------------------------------------------
1 | # Best Practices
2 |
3 | This section is about best practices related to distributed computing, and less about the Fugue framework. One of the things that makes it hard to transition from small data to big data is the mindset. Here, we go over best practices and explain how to fully utilize distributed computing.
4 |
5 | Have questions? Chat with us on Github or Slack:
6 |
7 | [](https://github.com/fugue-project/fugue)
8 | [](http://slack.fugue.ai)
9 |
10 | ```{toctree}
11 | :hidden:
12 |
13 | file_formats
14 | fugue_not_pandas
15 | fugue_spark_benchmark
16 | ```
17 |
18 | ## [File Formats](file_formats.ipynb)
19 | This section explains the difference between CSV and Parquet files, and why Parquet files are better suited for big data jobs.
20 |
21 | ## [Explicit Schema](explicit_schema.ipynb)
22 | COMING SOON
23 |
24 | ## [Why Fugue is Not Pandas-like](fugue_not_pandas.ipynb)
25 | There are other libraries that promise to distribute Pandas just by changing the import statement. In this section, we explain why Pandas-like frameworks are not meant for distributed computing.
26 |
27 | ## [Fugue Spark Benchmark](fugue_spark_benchmark.ipynb)
28 | We show that Fugue has minimal overhead by adding it to the Databricks benchmarks.
--------------------------------------------------------------------------------
/tutorials/resources/content.md:
--------------------------------------------------------------------------------
1 | # Content
2 |
3 | This is a collection of blog posts and conference talks.
4 |
5 | Have questions? Chat with us on Github or Slack:
6 |
7 | [](https://github.com/fugue-project/fugue)
8 | [](http://slack.fugue.ai)
9 |
10 | ## Case Studies
11 |
12 | * Lyft - [How LyftLearn Democratizes Distributed Compute through Kubernetes Spark and Fugue](https://eng.lyft.com/how-lyftlearn-democratizes-distributed-compute-through-kubernetes-spark-and-fugue-c0875b97c3d9)
13 | * Lyft - [Distributed Machine Learning at Lyft](https://www.youtube.com/watch?v=_IVyIOV0LgY)
14 | * Clobotics - [Large Scale Image Processing with Spark through Fugue](https://medium.com/fugue-project/large-scale-image-processing-with-spark-through-fugue-e510b9813da8)
15 |
16 | ## Blogs
17 |
18 | **2022**
19 |
20 | Fugue Core
21 |
22 | * [Introducing Fugue — Reducing PySpark Developer Friction](https://towardsdatascience.com/introducing-fugue-reducing-pyspark-developer-friction-a702230455de)
23 | * [Why Pandas-like Interfaces are Sub-optimal for Distributed Computing](https://towardsdatascience.com/why-pandas-like-interfaces-are-sub-optimal-for-distributed-computing-322dacbce43?gi=cb919ef43b2b)
24 |
25 | Fugue SQL
26 | * [Why SQL-Like Interfaces are Sub-optimal for Distributed Computing](https://towardsdatascience.com/why-sql-like-interfaces-are-sub-optimal-for-distributed-computing-45f62224bab4)
27 |
28 | Integrations
29 |
30 | * [PyCaret](https://github.com/pycaret/pycaret)
31 | * [Scaling PyCaret with Spark (or Dask) through Fugue](https://medium.com/p/60bdc3ce133f) (Towards Data Science)
32 | * [DuckDB](https://duckdb.org/)
33 | * [Fugue and DuckDB: Fast SQL Code in Python](https://towardsdatascience.com/fugue-and-duckdb-fast-sql-code-in-python-e2e2dfc0f8eb) (Towards Data Science by Khuyen Tran)
34 | * [WhyLogs](https://github.com/whylabs/whylogs)
35 | * [Large Scale Data Profiling with whylogs and Fugue on Spark, Ray or Dask](https://kdykho.medium.com/large-scale-data-profiling-with-whylogs-and-fugue-on-spark-ray-or-dask-e6917f6e1621?source=user_profile---------1----------------------------)
36 | * [Nixtla](https://github.com/Nixtla/statsforecast/)
37 | * [Distributed Forecast of 1M Time Series in Under 15 Minutes with Spark, Nixtla, and Fugue](https://towardsdatascience.com/distributed-forecast-of-1m-time-series-in-under-15-minutes-with-spark-nixtla-and-fugue-e9892da6fd5c)
38 |
39 | **2021**
40 |
41 | Fugue Core
42 |
43 | * [Fugue - Reducing Spark Developer Friction](https://jameskle.com/writes/fugue) (James Le Blog)
44 | * [Creating Pandas and Spark Compatible Functions with Fugue](https://towardsdatascience.com/creating-pandas-and-spark-compatible-functions-with-fugue-8617c0b3d3a8) (Towards Data Science)
45 |
46 | Data Validation
47 |
48 | * [Using Pandera on Spark for Data Validation through Fugue](https://towardsdatascience.com/using-pandera-on-spark-for-data-validation-through-fugue-72956f274793) (Towards Data Science)
49 |
50 | FugueSQL
51 |
52 | * [Interoperable Python and SQL in Jupyter Notebooks](https://towardsdatascience.com/interoperable-python-and-sql-in-jupyter-notebooks-86245e711352) (Towards Data Science)
53 | * [Data Analysis with FugueSQL on Coiled Dask Clusters](https://coiled.io/data-analysis-with-fuguesql-on-coiled-dask-clusters/) ([Coiled](https://coiled.io/) Blog)
54 | * [Introducing FugueSQL — SQL for Pandas, Spark, and Dask DataFrames](https://towardsdatascience.com/introducing-fuguesql-sql-for-pandas-spark-and-dask-dataframes-63d461a16b27) (By Khuyen Tran on Towards Data Science)
55 |
56 |
57 | ## Conferences and Meetups
58 |
59 | **2022**
60 |
61 | Fugue
62 |
63 | * [Comparing the Different Ways to Scale Python and Pandas Code](https://www.youtube.com/watch?v=b3ae0m_XTys) (PyCon US)
64 | * [Comparing the Different Ways to Scale Python and Pandas Code](https://www.youtube.com/watch?v=uyaIrVvBSW4) (SciPy)
65 | * [Testing Big Data Applications (Spark, Dask, and Ray)](https://www.youtube.com/watch?v=yQHksEh1GCs&list=PLGVZCDnMOq0opPc5-dp6ZDCFvOqDBlUuv&index=37) (PyData NYC)
66 |
67 | Machine Learning
68 |
69 | * [Distributed Machine Learning at Lyft](https://www.youtube.com/watch?v=_IVyIOV0LgY)
70 |
71 | Tune
72 |
73 | * [Distributed Hybrid Parameter Tuning](https://www.youtube.com/watch?v=_GBjqskD8Qk) (Databricks Summit)
74 | * [An Introduction to Distributed Hybrid Hyperparameter Optimization](https://www.youtube.com/watch?v=vj5Tsy_qM5g) (SciPy)
75 |
76 | FugueSQL
77 |
78 | * [FugueSQL - The Enhanced SQL Interface for Pandas and Spark DataFrames](https://www.youtube.com/watch?v=F9uzZh5dC0M) (Databricks Summit)
79 |
80 | **2021**
81 |
82 | Data Validation
83 |
84 | * [Large Scale Data Validation with Spark and Dask](https://www.youtube.com/watch?v=2AdvBgjO_3Q) (PyCon US)
85 | * [Fully Utilizing Spark for Data Validation](https://www.youtube.com/watch?v=f901OJrP5ls) (Spark AI Summit)
86 | * [Large Scale Data Validation with Fugue](https://www.youtube.com/watch?v=fSASmPNW3vc) (PyData Global)
87 |
88 | FugueSQL
89 |
90 | * [Dask SQL Query Engines](https://www.youtube.com/watch?v=bQDN41Bc3bw) (Dask Summit)
91 | * [FugueSQL: Extending SQL Interface for End-to-End Data Pipelines](https://www.dremio.com/subsurface/fugue-sql-extending-sql-interface-for-end-to-end-data-pipelines/) (Dremio Subsurface)
92 | * [FugueSQL - The Enhanced SQL Interface for Pandas, Spark, and Dask DataFrames](https://www.youtube.com/watch?v=OBpnGYjNBBI) (PyData Global)
93 |
94 | Machine Learning
95 |
96 | * [Superworkflow of Graph Neural Networks with K8S and Fugue](https://www.youtube.com/watch?v=-aEZjQiqSFA) (Spark AI Summit)
97 | * [Scaling Machine Learning Workflows to Big Data with Fugue](https://www.youtube.com/watch?v=fDIRMiwc0aA) (KubeCon)
98 | * [Distributed ML to Learn Causal Effect Using Fugue and Spark](https://www.youtube.com/watch?v=dafU1SZs4iw) (AI Camp)
99 |
100 | Tune
101 |
102 | * [Intuitive and Scalable Hyperparameter Tuning with Apache Spark + Fugue](https://www.youtube.com/watch?v=JUretXiLtK0) (Spark AI Summit)
103 | * [Fugue Tune](https://www.youtube.com/watch?v=MRa0ao4tfWc) (PyData Global)
104 |
105 | Testing Spark
106 |
107 | * [Simplifying Testing of Spark Applications](https://www.youtube.com/watch?v=_ieqg_soB3U) (PyData Global)
108 |
109 | **2020**
110 |
111 | * [Unifying Spark and Non-Spark Ecosystems for Big Data Analytics](https://www.youtube.com/watch?v=BBd4b2pMk0c) (Spark AI Summit)
--------------------------------------------------------------------------------
/tutorials/tune/index.md:
--------------------------------------------------------------------------------
1 | # Fugue Tune
2 |
3 | [Tune](https://github.com/fugue-project/tune) is an abstraction layer for general parameter tuning built on top of [Fugue](https://github.com/fugue-project/fugue). It can run hyperparameter tuning frameworks such as [Optuna](https://optuna.org/) and [Hyperopt](http://hyperopt.github.io/hyperopt/) on the backends supported by Fugue (Spark, Dask, Ray, and local). Beyond tuning models built with typical machine learning libraries such as [Scikit-learn](https://scikit-learn.org/stable/) and [Keras](https://keras.io/), Tune can also be used for general scientific computing.
4 |
5 | Tune has the following goals:
6 |
7 | * Provide the simplest and most intuitive APIs for major tuning cases.
8 | * Be scale agnostic and platform agnostic. We want you to worry less about distributed computing and just focus on the tuning logic itself. Built on Fugue, Tune lets you develop your tuning process iteratively. You can test with small spaces on a local machine, and then switch to larger spaces and run them distributedly with no code change.
9 | * Be highly extendable and flexible on lower level abstractions to integrate with libraries such as [Hyperopt](http://hyperopt.github.io/hyperopt/), [Optuna](https://optuna.org/), and [Nevergrad](https://facebookresearch.github.io/nevergrad/).
10 |
11 | Have questions? Chat with us on Github or Slack:
12 |
13 | [](https://github.com/fugue-project/fugue)
14 | [](http://slack.fugue.ai)
15 |
16 | ## Installation
17 |
18 | Tune is available through pip.
19 |
20 | ```bash
21 | pip install tune
22 | ```
23 |
24 | Tune does not come with any machine learning libraries because it can also be used to tune any objective function (as in the case of scientific computing). To use it with scikit-learn and [Bayesian Optimization](https://en.wikipedia.org/wiki/Bayesian_optimization), you can install the extras.
25 |
26 | ```bash
27 | pip install tune[hyperopt,sklearn]
28 | ```
29 |
30 | ```{toctree}
31 | :hidden:
32 |
33 | search_space
34 | non_iterative
35 | iterative
36 | ```
37 |
38 | ## Tune Tutorials
39 |
40 | ### [Search Space](search_space.ipynb)
41 |
42 | Here we learn how to define the search space for hyperparameter tuning. Fugue Tune provides an intuitive and scalable interface for defining hyperparameter combinations for an experiment. Tune's search space is decoupled from any specific framework.
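
As a quick sketch of what a space definition can look like (based on the `Space`, `Grid`, and `Rand` constructs from the Tune README; the parameter names and values are illustrative):

```python
from tune import Space, Grid, Rand

# Grid values are enumerated exhaustively; Rand values are sampled by the
# chosen optimizer. The same Space object can later drive local or
# distributed tuning runs without modification.
space = Space(n_estimators=Grid(100, 200), learning_rate=Rand(0.01, 0.1))

for conf in space:  # the Grid expands into concrete configurations
    print(conf)
```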
43 |
44 | ### [Non-iterative Problems](non_iterative.ipynb)
45 |
46 | Next we apply the search space to non-iterative problems. These are machine learning models that converge to a solution in a single training call; Scikit-learn models fall under this category.
47 |
48 | ### [Iterative Problems](iterative.ipynb)
49 |
50 | Finally, we apply the search space to iterative problems, such as deep learning models that are trained over epochs.
--------------------------------------------------------------------------------