├── fugue
├── py.typed
├── sql
│ ├── __init__.py
│ └── _utils.py
├── _utils
│ ├── __init__.py
│ ├── registry.py
│ ├── misc.py
│ ├── interfaceless.py
│ └── exception.py
├── collections
│ ├── __init__.py
│ └── yielded.py
├── bag
│ ├── __init__.py
│ └── array_bag.py
├── extensions
│ ├── transformer
│ │ ├── constants.py
│ │ └── __init__.py
│ ├── creator
│ │ ├── __init__.py
│ │ └── creator.py
│ ├── outputter
│ │ ├── __init__.py
│ │ └── outputter.py
│ ├── processor
│ │ ├── __init__.py
│ │ └── processor.py
│ ├── __init__.py
│ └── _builtins
│ │ ├── __init__.py
│ │ └── creators.py
├── dataset
│ └── __init__.py
├── rpc
│ └── __init__.py
├── column
│ └── __init__.py
├── workflow
│ ├── __init__.py
│ ├── input.py
│ └── _workflow_context.py
├── test
│ ├── __init__.py
│ └── pandas_tester.py
├── execution
│ └── __init__.py
├── dataframe
│ └── __init__.py
├── registry.py
├── plugins.py
├── dev.py
├── api.py
├── exceptions.py
└── __init__.py
├── tests
├── fugue
│ ├── __init__.py
│ ├── bag
│ │ ├── __init__.py
│ │ └── test_array_bag.py
│ ├── rpc
│ │ ├── __init__.py
│ │ ├── test_func.py
│ │ ├── test_flask.py
│ │ └── test_base.py
│ ├── sql
│ │ └── __init__.py
│ ├── test
│ │ └── __init__.py
│ ├── column
│ │ ├── __init__.py
│ │ └── test_functions.py
│ ├── workflow
│ │ ├── __init__.py
│ │ ├── test_workflow_parallel.py
│ │ └── test_runtime_exception.py
│ ├── collections
│ │ └── __init__.py
│ ├── dataframe
│ │ ├── __init__.py
│ │ ├── test_arrow_dataframe.py
│ │ ├── test_dataframes.py
│ │ └── test_dataframe.py
│ ├── execution
│ │ ├── __init__.py
│ │ ├── test_execution_engine.py
│ │ └── test_api.py
│ ├── extensions
│ │ ├── __init__.py
│ │ ├── transformer
│ │ │ └── __init__.py
│ │ ├── creator
│ │ │ └── __init__.py
│ │ ├── outputter
│ │ │ └── __init__.py
│ │ └── processor
│ │ │ └── __init__.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── test_misc.py
│ │ └── test_interfaceless.py
├── fugue_dask
│ ├── __init__.py
│ ├── test_importless.py
│ └── test_sql.py
├── fugue_ibis
│ ├── __init__.py
│ ├── mock
│ │ ├── __init__.py
│ │ ├── registry.py
│ │ ├── tester.py
│ │ └── dataframe.py
│ ├── test_execution_engine.py
│ └── test_dataframe.py
├── fugue_ray
│ ├── __init__.py
│ ├── test_utils.py
│ └── test_registry.py
├── fugue_duckdb
│ ├── __init__.py
│ └── test_importless.py
├── fugue_notebook
│ └── __init__.py
├── fugue_polars
│ ├── __init__.py
│ └── test_api.py
├── fugue_spark
│ ├── __init__.py
│ ├── utils
│ │ └── __init__.py
│ ├── test_sql.py
│ ├── test_importless.py
│ └── test_spark_connect.py
└── __init__.py
├── fugue_ray
├── _utils
│ ├── __init__.py
│ └── cluster.py
├── __init__.py
├── tester.py
├── _constants.py
└── registry.py
├── fugue_spark
├── _utils
│ ├── __init__.py
│ └── misc.py
├── __init__.py
├── _constants.py
└── tester.py
├── fugue_notebook
└── nbextension
│ ├── __init__.py
│ ├── README.md
│ └── description.yaml
├── fugue_version
└── __init__.py
├── fugue_polars
├── __init__.py
└── _utils.py
├── images
├── extensions.png
├── architecture.png
└── logo.svg
├── .github
├── ISSUE_TEMPLATE
│ ├── questions.md
│ ├── deprecation.md
│ ├── compatibility.md
│ ├── bug_report.md
│ └── feature_request.md
└── workflows
│ ├── publish.yml
│ ├── test_notebook.yml
│ ├── test_win.yml
│ ├── test_core.yml
│ ├── test_no_sql.yml
│ ├── test_all.yml
│ ├── test_ray.yml
│ ├── test_dask.yml
│ └── test_spark.yml
├── docs
├── _templates
│ └── toc.rst_t
├── api.rst
├── Makefile
├── make.bat
├── tutorials.rst
├── index.rst
├── _static
│ ├── logo.svg
│ └── logo_doc.svg
├── api_sql
│ └── fugue_sql.rst
├── api
│ ├── fugue.rpc.rst
│ ├── fugue.sql.rst
│ ├── fugue.bag.rst
│ ├── fugue.dataset.rst
│ ├── fugue.extensions.rst
│ ├── fugue.extensions.creator.rst
│ ├── fugue.extensions.outputter.rst
│ ├── fugue.extensions.processor.rst
│ ├── fugue.column.rst
│ ├── fugue.collections.rst
│ ├── fugue.workflow.rst
│ └── fugue.extensions.transformer.rst
├── api_ibis
│ ├── fugue_ibis.execution.rst
│ └── fugue_ibis.rst
└── api_ray
│ └── fugue_ray.rst
├── fugue_dask
├── __init__.py
├── _constants.py
├── tester.py
├── registry.py
└── _dask_sql_wrapper.py
├── fugue_ibis
├── __init__.py
└── _compat.py
├── fugue_contrib
├── __init__.py
├── contrib.py
├── viz
│ ├── __init__.py
│ └── _ext.py
└── seaborn
│ └── __init__.py
├── fugue_duckdb
├── __init__.py
└── tester.py
├── fugue_sql
├── exceptions.py
└── __init__.py
├── scripts
└── setupsparkconnect.sh
├── .readthedocs.yaml
├── .pylintrc
├── requirements.txt
├── .gitpod.yml
├── .devcontainer
└── devcontainer.json
├── setup.cfg
├── .pre-commit-config.yaml
├── fugue_test
├── fixtures.py
├── __init__.py
└── bag_suite.py
└── .gitignore
/fugue/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fugue/sql/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fugue/_utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fugue_ray/_utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/bag/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/rpc/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/sql/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue_dask/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue_ibis/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue_ray/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fugue/collections/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fugue_spark/_utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/column/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/workflow/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue_duckdb/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue_ibis/mock/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue_notebook/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue_polars/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue_spark/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fugue_notebook/nbextension/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/collections/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/dataframe/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tests/fugue/execution/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tests/fugue/extensions/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fugue/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tests/fugue_spark/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # pylint: disable-all
2 |
--------------------------------------------------------------------------------
/tests/fugue/extensions/transformer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fugue_version/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.9.3"
2 |
--------------------------------------------------------------------------------
/fugue/bag/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .bag import Bag, LocalBag
3 |
--------------------------------------------------------------------------------
/fugue_polars/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .polars_dataframe import PolarsDataFrame
3 |
--------------------------------------------------------------------------------
/images/extensions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/fugue/HEAD/images/extensions.png
--------------------------------------------------------------------------------
/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fugue-project/fugue/HEAD/images/architecture.png
--------------------------------------------------------------------------------
/fugue/extensions/transformer/constants.py:
--------------------------------------------------------------------------------
1 | OUTPUT_TRANSFORMER_DUMMY_SCHEMA = "__output_no_data__:int"
2 |
--------------------------------------------------------------------------------
/fugue/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .api import *
3 | from .dataset import AnyDataset, Dataset, DatasetDisplay, get_dataset_display
4 |
--------------------------------------------------------------------------------
/fugue_ray/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from fugue_ray.dataframe import RayDataFrame
4 | from fugue_ray.execution_engine import RayExecutionEngine
5 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/questions.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Questions
3 | about: General questions
4 | title: "[QUESTION]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/deprecation.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Deprecation
3 | about: Deprecate certain features
4 | title: "[DEPRECATION]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
--------------------------------------------------------------------------------
/fugue_notebook/nbextension/README.md:
--------------------------------------------------------------------------------
1 | # Fugue Notebook Extension
2 |
3 | - Add `%%fsql` magic to run Fugue SQL
4 | - Add Fugue SQL syntax highlighting in code cells for `%%fsql`
5 |
--------------------------------------------------------------------------------
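A sketch of a notebook cell using the `%%fsql` magic described above; the Fugue SQL body mirrors snippets from this repo's tests:

%%fsql
CREATE [[0],[1]] SCHEMA a:int
SELECT * WHERE a<1
PRINT

--------------------------------------------------------------------------------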
/docs/_templates/toc.rst_t:
--------------------------------------------------------------------------------
1 | {{ header | heading }}
2 |
3 | .. toctree::
4 | :maxdepth: {{ maxdepth }}
5 | {% for docname in docnames %}
6 | {{ docname }}
7 | {%- endfor %}
8 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/compatibility.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Compatibility
3 | about: Compatibility with dependent packages updates
4 | title: "[COMPATIBILITY]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
--------------------------------------------------------------------------------
/fugue_dask/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue_version import __version__
3 |
4 | from fugue_dask.dataframe import DaskDataFrame
5 | from fugue_dask.execution_engine import DaskExecutionEngine
6 |
--------------------------------------------------------------------------------
/fugue_polars/_utils.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | from triad import Schema
3 |
4 |
5 | def build_empty_pl(schema: Schema) -> pl.DataFrame:
6 | return pl.from_arrow(schema.create_empty_arrow_table())
7 |
--------------------------------------------------------------------------------
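A usage sketch of build_empty_pl, relying on the triad Schema string syntax used throughout this repo:

from triad import Schema

from fugue_polars._utils import build_empty_pl

# an empty polars DataFrame that still carries the column structure
df = build_empty_pl(Schema("a:int,b:str"))
assert df.shape == (0, 2)  # zero rows, columns a and b

--------------------------------------------------------------------------------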
/fugue_spark/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue_version import __version__
3 |
4 | from fugue_spark.dataframe import SparkDataFrame
5 | from fugue_spark.execution_engine import SparkExecutionEngine
6 |
--------------------------------------------------------------------------------
/fugue_notebook/nbextension/description.yaml:
--------------------------------------------------------------------------------
1 | Type: Jupyter Notebook Extension
2 | Compatibility: 3.x, 4.x, 5.x, 6.x
3 | Name: Fugue
4 | Main: main.js
5 | Link: README.md
6 | Description: |
7 | Fugue Jupyter extension
8 |
--------------------------------------------------------------------------------
/fugue_spark/_constants.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Any
2 |
3 | FUGUE_SPARK_CONF_USE_PANDAS_UDF = "fugue.spark.use_pandas_udf"
4 |
5 | FUGUE_SPARK_DEFAULT_CONF: Dict[str, Any] = {FUGUE_SPARK_CONF_USE_PANDAS_UDF: True}
6 |
--------------------------------------------------------------------------------
/fugue_ibis/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from triad import run_at_def
3 |
4 | from ._compat import IbisSchema, IbisTable
5 | from .dataframe import IbisDataFrame
6 | from .execution_engine import IbisExecutionEngine, IbisSQLEngine
7 |
--------------------------------------------------------------------------------
/fugue/rpc/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue.rpc.base import (
3 | RPCClient,
4 | EmptyRPCHandler,
5 | RPCFunc,
6 | RPCHandler,
7 | RPCServer,
8 | make_rpc_server,
9 | to_rpc_handler,
10 | )
11 |
--------------------------------------------------------------------------------
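A minimal sketch of what these exports are for: to_rpc_handler wraps a plain callable into an RPCHandler, the type fugue uses to ship driver-side callbacks to workers:

from fugue.rpc import RPCHandler, to_rpc_handler

handler = to_rpc_handler(lambda x: x + "!")  # plain callable -> RPC handler
assert isinstance(handler, RPCHandler)

--------------------------------------------------------------------------------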
/fugue/column/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue.column.expressions import ColumnExpr, all_cols, col, function, lit, null
3 | from fugue.column.functions import is_agg
4 | from fugue.column.sql import SelectColumns, SQLExpressionGenerator
5 |
--------------------------------------------------------------------------------
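A minimal sketch of the column expression API exported above; the operator overloading on ColumnExpr and the f.max aggregation are assumptions based on fugue's column module, not shown in this dump:

import fugue.column.functions as f
from fugue.column import col, is_agg, lit

# arithmetic and aliasing compose into a ColumnExpr tree
expr = (col("a") + lit(1)).alias("a1")
assert not is_agg(expr)          # no aggregation inside
assert is_agg(f.max(col("a")))   # f.max wraps an aggregation

--------------------------------------------------------------------------------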
/fugue/extensions/creator/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue.extensions.creator.convert import (
3 | _to_creator,
4 | creator,
5 | parse_creator,
6 | register_creator,
7 | )
8 | from fugue.extensions.creator.creator import Creator
9 |
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | API Reference
2 | ==============
3 |
4 | .. toctree::
5 |
6 | api/fugue
7 | api_sql/fugue_sql
8 | api_duckdb/fugue_duckdb
9 | api_spark/fugue_spark
10 | api_dask/fugue_dask
11 | api_ray/fugue_ray
12 | api_ibis/fugue_ibis
13 |
--------------------------------------------------------------------------------
/fugue/extensions/outputter/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue.extensions.outputter.convert import (
3 | _to_outputter,
4 | outputter,
5 | parse_outputter,
6 | register_outputter,
7 | )
8 | from fugue.extensions.outputter.outputter import Outputter
9 |
--------------------------------------------------------------------------------
/fugue/extensions/processor/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue.extensions.processor.convert import (
3 | _to_processor,
4 | parse_processor,
5 | processor,
6 | register_processor,
7 | )
8 | from fugue.extensions.processor.processor import Processor
9 |
--------------------------------------------------------------------------------
/fugue/workflow/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from ._workflow_context import FugueWorkflowContext
4 | from .api import *
5 | from .input import register_raw_df_type
6 | from .module import module
7 | from .workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames
8 |
--------------------------------------------------------------------------------
/fugue_contrib/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 |
3 | from .contrib import FUGUE_CONTRIB
4 |
5 |
6 | def load_namespace(namespace: str) -> None:
7 | if namespace in FUGUE_CONTRIB:
8 | path = FUGUE_CONTRIB[namespace]["module"]
9 | importlib.import_module(path)
10 |
--------------------------------------------------------------------------------
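For illustration, load_namespace resolves a registered name from FUGUE_CONTRIB and imports the mapped module for its registration side effects; unknown names are silently ignored:

from fugue_contrib import load_namespace

load_namespace("viz")      # imports fugue_contrib.viz, registering its extensions
load_namespace("unknown")  # not in FUGUE_CONTRIB, so this is a no-op

--------------------------------------------------------------------------------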
/fugue_ibis/_compat.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # pylint: disable-all
3 |
4 | try: # pragma: no cover
5 | from ibis.expr.types import Table as IbisTable
6 | except Exception: # pragma: no cover
7 | from ibis.expr.types import TableExpr as IbisTable
8 |
9 | from ibis import Schema as IbisSchema
10 |
--------------------------------------------------------------------------------
/fugue_contrib/contrib.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Any
2 |
3 | FUGUE_CONTRIB: Dict[str, Any] = {
4 | "viz": {"module": "fugue_contrib.viz"},
5 | "sns": {"module": "fugue_contrib.seaborn"},
6 | "why": {"module": "whylogs.api.fugue.registry"},
7 | "vizzu": {"module": "ipyvizzu.integrations.fugue"},
8 | }
9 |
--------------------------------------------------------------------------------
/fugue/test/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .pandas_tester import NativeTestBackend, PandasTestBackend
3 | from .plugins import (
4 | FugueTestBackend,
5 | FugueTestContext,
6 | FugueTestSuite,
7 | extract_conf,
8 | fugue_test_backend,
9 | fugue_test_suite,
10 | with_backend,
11 | )
12 |
--------------------------------------------------------------------------------
/fugue_duckdb/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue import register_execution_engine, register_sql_engine
3 |
4 | from fugue_duckdb.execution_engine import DuckDBEngine, DuckExecutionEngine
5 |
6 | try:
7 | from fugue_duckdb.dask import DuckDaskExecutionEngine
8 | except Exception: # pragma: no cover
9 | pass
10 |
--------------------------------------------------------------------------------
/fugue_sql/exceptions.py:
--------------------------------------------------------------------------------
1 | # pylint: disable-all
2 | # flake8: noqa
3 | # TODO: This folder is to be deprecated
4 | import warnings
5 | from fugue.exceptions import *
6 |
7 | warnings.warn(
8 | "fsql and FugueSQLWorkflow now should be imported directly from fugue, "
9 | "fugue_sql will be removed in 0.9.0"
10 | )
11 |
--------------------------------------------------------------------------------
/scripts/setupsparkconnect.sh:
--------------------------------------------------------------------------------
1 | wget https://dlcdn.apache.org/spark/spark-3.5.7/spark-3.5.7-bin-hadoop3.tgz -O - | tar -xz -C /tmp
2 | # export SPARK_NO_DAEMONIZE=1
3 | bash /tmp/spark-3.5.7-bin-hadoop3/sbin/start-connect-server.sh --jars https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.7/spark-connect_2.12-3.5.7.jar
4 |
--------------------------------------------------------------------------------
/fugue_sql/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # TODO: This folder is to be deprecated
3 | from fugue_version import __version__
4 |
5 | import warnings
6 | from fugue import FugueSQLWorkflow, fsql
7 |
8 | warnings.warn(
9 | "fsql and FugueSQLWorkflow now should be imported directly from fugue, "
10 | "fugue_sql will be removed in 0.9.0"
11 | )
12 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | # Set the version of Python and other tools you might need
4 | build:
5 | os: ubuntu-20.04
6 | tools:
7 | python: "3.10"
8 | jobs:
9 | pre_install:
10 | - pip install -U pip
11 |
12 | sphinx:
13 | configuration: docs/conf.py
14 |
15 | python:
16 | install:
17 | - requirements: requirements.txt
18 |
--------------------------------------------------------------------------------
/fugue/_utils/registry.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 |
3 | from triad import conditional_dispatcher
4 | from triad.utils.dispatcher import ConditionalDispatcher
5 |
6 | from ..constants import FUGUE_ENTRYPOINT
7 |
8 |
9 | def fugue_plugin(func: Callable) -> ConditionalDispatcher:
10 | return conditional_dispatcher(entry_point=FUGUE_ENTRYPOINT)(func) # type: ignore
11 |
--------------------------------------------------------------------------------
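fugue_plugin turns a function into a triad ConditionalDispatcher that is also extensible through the fugue entry point. A minimal sketch with a hypothetical function (pretty_name is illustrative, not part of fugue); it assumes the decorated body acts as the fallback when no candidate matches:

from fugue._utils.registry import fugue_plugin


@fugue_plugin
def pretty_name(obj: object) -> str:  # hypothetical example function
    return type(obj).__name__  # default implementation (fallback)


@pretty_name.candidate(lambda obj: isinstance(obj, int))
def _pretty_int(obj: int) -> str:
    return f"int({obj})"


assert pretty_name("x") == "str"    # no candidate matches -> fallback
assert pretty_name(3) == "int(3)"   # dispatched to _pretty_int

--------------------------------------------------------------------------------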
/fugue/workflow/input.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 |
3 |
4 | def register_raw_df_type(df_type: Type) -> None: # pragma: no cover
5 | """TODO: This function is to be removed before 0.9.0
6 |
7 | .. deprecated:: 0.8.0
8 | Register using :func:`fugue.api.is_df` instead.
9 | """
10 | raise DeprecationWarning("use fugue.api.is_df to register the dataframe")
11 |
--------------------------------------------------------------------------------
/tests/fugue_polars/test_api.py:
--------------------------------------------------------------------------------
1 | import fugue.api as fa
2 | import pandas as pd
3 | import polars as pl
4 |
5 |
6 | def test_to_df():
7 | df = pl.from_pandas(pd.DataFrame({"a": [0, 1]}))
8 | res = fa.fugue_sql("SELECT * FROM df", df=df, engine="duckdb")
9 | assert fa.as_array(res) == [[0], [1]]
10 |
11 | df2 = pl.from_pandas(pd.DataFrame({"a": [0]}))
12 | res = fa.inner_join(df, df2, engine="duckdb")
13 | assert fa.as_array(res) == [[0]]
14 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MESSAGES CONTROL]
2 | disable=unknown-option-value,useless-option-value,C0103,C0114,C0115,C0116,C2201,C0200,C0201,C0207,C0209,C0302,C0411,C0415,C2801,E0401,E0712,E1130,E1136,R0201,R0205,R0801,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0917,R1705,R1710,R1714,R1718,R1720,R1724,R1731,R1735,R1737,W0102,W0107,W0108,W0201,W0212,W0221,W0223,W0237,W0511,W0603,W0613,W0621,W0622,W0631,W0640,W0703,W0707,W0719,W1116
3 | # TODO: R0205: inherits from object, can be safely removed
4 |
--------------------------------------------------------------------------------
/tests/fugue_ibis/mock/registry.py:
--------------------------------------------------------------------------------
1 | from fugue.plugins import parse_execution_engine
2 | from typing import Any
3 | from .execution_engine import MockDuckExecutionEngine
4 |
5 |
6 | @parse_execution_engine.candidate(
7 | lambda engine, conf, **kwargs: isinstance(engine, str) and engine == "mockibisduck"
8 | )
9 | def _parse_mockibisduck(
10 | engine: str, conf: Any, **kwargs: Any
11 | ) -> MockDuckExecutionEngine:
12 | return MockDuckExecutionEngine(conf=conf)
13 |
--------------------------------------------------------------------------------
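With the candidate above registered (an import side effect), the string name resolves through fugue's engine factory; a brief sketch, assuming the factory consults parse_execution_engine candidates:

from fugue.execution import make_execution_engine

engine = make_execution_engine("mockibisduck")  # dispatched to _parse_mockibisduck

--------------------------------------------------------------------------------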
/tests/fugue/utils/test_misc.py:
--------------------------------------------------------------------------------
1 | from fugue._utils.misc import get_attribute
2 | from pytest import raises
3 |
4 |
5 | def test_get_attribute():
6 | class C(object):
7 | pass
8 |
9 | c = C()
10 | assert "x" not in c.__dict__
11 | assert 0 == get_attribute(c, "x", int)
12 | assert 0 == c.x
13 | assert 0 == get_attribute(c, "x", int)
14 | c.x = 10
15 | assert 10 == get_attribute(c, "x", int)
16 | raises(TypeError, lambda: get_attribute(c, "x", str))
17 |
--------------------------------------------------------------------------------
/fugue/execution/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .api import *
3 | from .execution_engine import AnyExecutionEngine, ExecutionEngine, MapEngine, SQLEngine
4 | from .factory import (
5 | infer_execution_engine,
6 | make_execution_engine,
7 | make_sql_engine,
8 | register_default_execution_engine,
9 | register_default_sql_engine,
10 | register_execution_engine,
11 | register_sql_engine,
12 | )
13 | from .native_execution_engine import NativeExecutionEngine, QPDPandasEngine
14 |
--------------------------------------------------------------------------------
/fugue/extensions/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue.extensions.transformer.convert import (
3 | _to_output_transformer,
4 | _to_transformer,
5 | cotransformer,
6 | output_cotransformer,
7 | output_transformer,
8 | parse_output_transformer,
9 | parse_transformer,
10 | register_output_transformer,
11 | register_transformer,
12 | transformer,
13 | )
14 | from fugue.extensions.transformer.transformer import (
15 | CoTransformer,
16 | OutputCoTransformer,
17 | OutputTransformer,
18 | Transformer,
19 | )
20 |
--------------------------------------------------------------------------------
/fugue_dask/_constants.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 |
3 | import dask
4 | import pandas as pd
5 | import pyarrow as pa
6 | from packaging import version
7 |
8 | FUGUE_DASK_CONF_DEFAULT_PARTITIONS = "fugue.dask.default.partitions"
9 | FUGUE_DASK_DEFAULT_CONF: Dict[str, Any] = {FUGUE_DASK_CONF_DEFAULT_PARTITIONS: -1}
10 | FUGUE_DASK_USE_ARROW = (
11 | hasattr(pd, "ArrowDtype")
12 | and version.parse(dask.__version__) >= version.parse("2023.2")
13 | and version.parse(pa.__version__) >= version.parse("7")
14 | and version.parse(pd.__version__) >= version.parse("2")
15 | )
16 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: "[BUG]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Minimal Code To Reproduce**
11 |
12 | ```python
13 | ```
14 |
15 | **Describe the bug**
16 | A clear and concise description of what the bug is.
17 |
18 | **Expected behavior**
19 | A clear and concise description of what you expected to happen.
20 |
21 | **Environment (please complete the following information):**
22 | - Backend: pandas/dask/ray?
23 | - Backend version:
24 | - Python version:
25 | - OS: linux/windows
26 |
--------------------------------------------------------------------------------
/fugue_ray/tester.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from typing import Any, Dict, Iterator
3 |
4 | import ray
5 |
6 | import fugue.test as ft
7 |
8 |
9 | @ft.fugue_test_backend
10 | class RayTestBackend(ft.FugueTestBackend):
11 | name = "ray"
12 | default_session_conf = {"num_cpus": 2}
13 | default_fugue_conf = {
14 | "fugue.ray.zero_copy": True,
15 | "fugue.ray.default.batch_size": 10000,
16 | }
17 |
18 | @classmethod
19 | @contextmanager
20 | def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
21 | with ray.init(**session_conf):
22 | yield "ray"
23 |
--------------------------------------------------------------------------------
/fugue_ray/_constants.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 |
3 | import ray
4 | from packaging import version
5 |
6 | FUGUE_RAY_CONF_SHUFFLE_PARTITIONS = "fugue.ray.shuffle.partitions"
7 | FUGUE_RAY_DEFAULT_PARTITIONS = "fugue.ray.default.partitions"
8 | FUGUE_RAY_DEFAULT_BATCH_SIZE = "fugue.ray.default.batch_size"
9 | FUGUE_RAY_ZERO_COPY = "fugue.ray.zero_copy"
10 |
11 | FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = {
12 | FUGUE_RAY_CONF_SHUFFLE_PARTITIONS: -1,
13 | FUGUE_RAY_DEFAULT_PARTITIONS: 0,
14 | FUGUE_RAY_ZERO_COPY: True,
15 | }
16 | RAY_VERSION = version.parse(ray.__version__)
17 |
18 | _ZERO_COPY: Dict[str, Any] = {"zero_copy_batch": True}
19 |
--------------------------------------------------------------------------------
/tests/fugue_duckdb/test_importless.py:
--------------------------------------------------------------------------------
1 | from fugue import FugueWorkflow
2 | from fugue import fsql
3 |
4 |
5 | def test_importless():
6 | for engine in ["duck", "duckdb"]:
7 | dag = FugueWorkflow()
8 | dag.df([[0]], "a:int").show()
9 |
10 | dag.run(engine)
11 |
12 | fsql(
13 | """
14 | CREATE [[0],[1]] SCHEMA a:int
15 | SELECT * WHERE a<1
16 | PRINT
17 | """
18 | ).run(engine)
19 |
20 | dag = FugueWorkflow()
21 | tdf = dag.df([[0], [1]], "a:int")
22 | dag.select("SELECT * FROM ", tdf, " WHERE a<1", sql_engine=engine)
23 |
24 | dag.run()
25 |
--------------------------------------------------------------------------------
/tests/fugue_ray/test_utils.py:
--------------------------------------------------------------------------------
1 | from triad import Schema
2 |
3 | import fugue.test as ft
4 | from fugue_ray import RayDataFrame
5 | from fugue_ray._utils.dataframe import add_partition_key
6 |
7 |
8 | @ft.with_backend("ray")
9 | def test_add_partition_key():
10 | df = RayDataFrame([[0, "a"], [1, "b"]], "a:int,b:str")
11 | res, s = add_partition_key(df.native, df.schema, ["b", "a"], output_key="x")
12 | assert s == Schema("a:int,b:str,x:binary")
13 |
14 | res, s = add_partition_key(df.native, df.schema, ["b"], output_key="x")
15 | assert s == "a:int,b:str,x:str"
16 | assert RayDataFrame(res, s).as_array() == [[0, "a", "a"], [1, "b", "b"]]
17 |
--------------------------------------------------------------------------------
/tests/fugue_dask/test_importless.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from fugue import FugueWorkflow, fsql
4 | import fugue.test as ft
5 |
6 | @ft.with_backend("dask")
7 | def test_importless(backend_context):
8 | pytest.importorskip("fugue_sql_antlr")
9 | for engine in ["dask", backend_context.session]:
10 | dag = FugueWorkflow()
11 | dag.df([[0]], "a:int").show()
12 |
13 | dag.run(engine)
14 |
15 | fsql(
16 | """
17 | CREATE [[0],[1]] SCHEMA a:int
18 | SELECT * WHERE a<1
19 | PRINT
20 | """
21 | ).run(engine)
22 |
23 | dag = FugueWorkflow()
24 |
25 | dag.run(engine)
26 |
--------------------------------------------------------------------------------
/fugue/extensions/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from ._utils import namespace_candidate
3 | from .creator import Creator, creator, parse_creator, register_creator
4 | from .outputter import Outputter, outputter, parse_outputter, register_outputter
5 | from .processor import Processor, parse_processor, processor, register_processor
6 | from .transformer import (
7 | CoTransformer,
8 | OutputCoTransformer,
9 | OutputTransformer,
10 | Transformer,
11 | cotransformer,
12 | output_cotransformer,
13 | output_transformer,
14 | parse_output_transformer,
15 | parse_transformer,
16 | register_output_transformer,
17 | register_transformer,
18 | transformer,
19 | )
20 |
--------------------------------------------------------------------------------
/fugue_ray/_utils/cluster.py:
--------------------------------------------------------------------------------
1 | from fugue import ExecutionEngine
2 |
3 | from .._constants import FUGUE_RAY_CONF_SHUFFLE_PARTITIONS, FUGUE_RAY_DEFAULT_PARTITIONS
4 | from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS
5 |
6 |
7 | def get_default_partitions(engine: ExecutionEngine) -> int:
8 | n = engine.conf.get(
9 | FUGUE_RAY_DEFAULT_PARTITIONS, engine.conf.get(FUGUE_CONF_DEFAULT_PARTITIONS, -1)
10 | )
11 | return n if n >= 0 else engine.get_current_parallelism() * 2
12 |
13 |
14 | def get_default_shuffle_partitions(engine: ExecutionEngine) -> int:
15 | n = engine.conf.get(FUGUE_RAY_CONF_SHUFFLE_PARTITIONS, -1)
16 | return n if n >= 0 else get_default_partitions(engine)
17 |
--------------------------------------------------------------------------------
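A self-contained sketch of the fallback logic above; the stub engine and the re-stated conf key strings are illustrative, not fugue API:

class _StubEngine:  # hypothetical stand-in for an ExecutionEngine
    conf: dict = {}

    def get_current_parallelism(self) -> int:
        return 4


def _default_partitions(engine) -> int:  # mirrors get_default_partitions
    n = engine.conf.get(
        "fugue.ray.default.partitions",
        engine.conf.get("fugue.default.partitions", -1),
    )
    return n if n >= 0 else engine.get_current_parallelism() * 2


assert _default_partitions(_StubEngine()) == 8  # -1 triggers parallelism * 2

--------------------------------------------------------------------------------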
/fugue/extensions/_builtins/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from fugue.extensions._builtins.creators import Load, CreateData
3 | from fugue.extensions._builtins.outputters import (
4 | AssertEqual,
5 | AssertNotEqual,
6 | RunOutputTransformer,
7 | Save,
8 | Show,
9 | )
10 | from fugue.extensions._builtins.processors import (
11 | Aggregate,
12 | AlterColumns,
13 | Assign,
14 | Distinct,
15 | DropColumns,
16 | Dropna,
17 | Fillna,
18 | Filter,
19 | Rename,
20 | RunJoin,
21 | RunSetOperation,
22 | RunSQLSelect,
23 | RunTransformer,
24 | Sample,
25 | SaveAndUse,
26 | Select,
27 | SelectColumns,
28 | Take,
29 | Zip,
30 | )
31 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | .[all]
2 |
3 | furo
4 |
5 | # test requirements
6 | pre-commit
7 | black>=22.3.0
8 | mypy
9 | flake8
10 | autopep8
11 | pylint==3.2.6
12 | pytest
13 | pytest-cov
14 | pytest-mock
15 | pytest-rerunfailures==10.2
16 | sphinx>=2.4.0
17 | sphinx-rtd-theme
18 | sphinx-autodoc-typehints
19 | flask
20 | psutil
21 | matplotlib
22 | seaborn
23 |
24 | notebook<7
25 | jupyter_contrib_nbextensions
26 |
27 | s3fs
28 |
29 | pyspark[connect]
30 | duckdb-engine>=0.6.4
31 | sqlalchemy==2.0.10 # 2.0.11 has a bug
32 | ray[data]>=2.30.0
33 | pydantic<2.5 # 2.5.0+ doesn't work with ray 2.8
34 | # pyarrow==7.0.0
35 | dask[distributed,dataframe]==2025.3.0
36 | dask-sql
37 |
38 | # publish to pypi
39 | wheel
40 | twine
41 |
--------------------------------------------------------------------------------
/tests/fugue_ibis/mock/tester.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from typing import Any, Dict, Iterator
3 |
4 | import pytest
5 |
6 | import fugue.test as ft
7 | from .registry import * # noqa: F401, F403 # pylint: disable-all
8 |
9 |
10 | @ft.fugue_test_backend
11 | class _MockIbisDuckDBTestBackend(ft.FugueTestBackend):
12 | name = "mockibisduck"
13 |
14 | @classmethod
15 | @contextmanager
16 | def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
17 | yield "mockibisduck"
18 |
19 |
20 | @pytest.fixture(scope="module")
21 | def mockibisduck_session():
22 | with _MockIbisDuckDBTestBackend.generate_session_fixture() as session:
23 | yield session
24 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: "[FEATURE]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/fugue/test/pandas_tester.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from typing import Any, Dict, Iterator
3 |
4 | from .plugins import FugueTestBackend, fugue_test_backend
5 |
6 |
7 | @fugue_test_backend
8 | class PandasTestBackend(FugueTestBackend):
9 | name = "pandas"
10 |
11 | @classmethod
12 | @contextmanager
13 | def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
14 | yield "pandas" # pragma: no cover
15 |
16 |
17 | @fugue_test_backend
18 | class NativeTestBackend(FugueTestBackend):
19 | name = "native"
20 |
21 | @classmethod
22 | @contextmanager
23 | def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
24 | yield "native" # pragma: no cover
25 |
--------------------------------------------------------------------------------
/tests/fugue_spark/test_sql.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | from pyspark.sql import SparkSession
4 |
5 | from fugue import FugueSQLWorkflow, register_execution_engine
6 | from fugue_spark import SparkExecutionEngine
7 |
8 |
9 | def test_sql(spark_session):
10 | pytest.importorskip("fugue_sql_antlr")
11 | register_execution_engine(
12 | "_spark",
13 | lambda conf, **kwargs: SparkExecutionEngine(
14 | conf=conf, spark_session=spark_session
15 | ),
16 | )
17 | df = spark_session.createDataFrame(pd.DataFrame([[0], [1]], columns=["a"]))
18 | dag = FugueSQLWorkflow()
19 | dag(
20 | """
21 | SELECT * FROM df WHERE a>0
22 | PRINT
23 | """,
24 | df=df,
25 | )
26 | dag.run("_spark")
27 |
--------------------------------------------------------------------------------
/fugue/dataframe/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .api import *
3 | from .array_dataframe import ArrayDataFrame
4 | from .arrow_dataframe import ArrowDataFrame
5 | from .dataframe import (
6 | AnyDataFrame,
7 | DataFrame,
8 | LocalBoundedDataFrame,
9 | LocalDataFrame,
10 | YieldedDataFrame,
11 | )
12 | from .dataframe_iterable_dataframe import (
13 | IterableArrowDataFrame,
14 | IterablePandasDataFrame,
15 | LocalDataFrameIterableDataFrame,
16 | )
17 | from .dataframes import DataFrames
18 | from .function_wrapper import DataFrameFunctionWrapper, fugue_annotated_param
19 | from .iterable_dataframe import IterableDataFrame
20 | from .pandas_dataframe import PandasDataFrame
21 | from .utils import get_column_names, normalize_dataframe_column_names, rename
22 |
--------------------------------------------------------------------------------
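A brief sketch of the core DataFrame API exported above, using PandasDataFrame; the schema-string comparison mirrors tests elsewhere in this dump:

import pandas as pd

from fugue.dataframe import PandasDataFrame

df = PandasDataFrame(pd.DataFrame({"a": [0, 1]}), "a:long")
assert df.schema == "a:long"        # triad Schema compares equal to strings
assert df.as_array() == [[0], [1]]  # materialize as a list of rows

--------------------------------------------------------------------------------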
/tests/fugue_dask/test_sql.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | pytest.importorskip("fugue_sql_antlr")
4 | import dask.dataframe as dd
5 | import pandas as pd
6 |
7 | from fugue import FugueSQLWorkflow, register_execution_engine
8 | from fugue_dask import DaskExecutionEngine
9 | import fugue.test as ft
10 |
11 |
12 | @ft.with_backend("dask")
13 | def test_sql(backend_context):
14 | register_execution_engine(
15 | "da",
16 | lambda conf, **kwargs: DaskExecutionEngine(
17 | conf=conf, dask_client=backend_context.session
18 | ),
19 | )
20 | df = dd.from_pandas(pd.DataFrame([[0], [1]], columns=["a"]), npartitions=2)
21 | dag = FugueSQLWorkflow()
22 | dag(
23 | """
24 | SELECT * FROM df WHERE a>0
25 | PRINT
26 | """,
27 | df=df,
28 | )
29 | dag.run("da")
30 |
--------------------------------------------------------------------------------
/fugue_dask/tester.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from typing import Any, Dict, Iterator
3 |
4 | import dask
5 | from dask.distributed import Client
6 |
7 | import fugue.test as ft
8 |
9 |
10 | @ft.fugue_test_backend
11 | class DaskTestBackend(ft.FugueTestBackend):
12 | name = "dask"
13 |
14 | @classmethod
15 | def transform_session_conf(cls, conf: Dict[str, Any]) -> Dict[str, Any]:
16 | return ft.extract_conf(conf, "dask.", remove_prefix=True)
17 |
18 | @classmethod
19 | @contextmanager
20 | def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
21 | with Client(**session_conf) as client:
22 | dask.config.set({"dataframe.shuffle.method": "tasks"})
23 | dask.config.set({"dataframe.convert-string": False})
24 | yield client
25 |
--------------------------------------------------------------------------------
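To illustrate transform_session_conf above: only keys under the "dask." prefix reach Client(...), with the prefix stripped; this assumes extract_conf matches on key prefix and returns a plain dict:

import fugue.test as ft

conf = {"dask.n_workers": 2, "fugue.dask.default.partitions": 4}
session_conf = ft.extract_conf(conf, "dask.", remove_prefix=True)
assert session_conf == {"n_workers": 2}  # fugue.* keys are left for the engine

--------------------------------------------------------------------------------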
/tests/fugue_ray/test_registry.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import ray.data as rd
3 |
4 | import fugue.test as ft
5 | from fugue import FugueWorkflow
6 | from fugue_ray import RayExecutionEngine
7 |
8 |
9 | @ft.with_backend("ray")
10 | def test_registry():
11 | def creator() -> rd.Dataset:
12 | return rd.from_pandas(pd.DataFrame(dict(a=[1, 2], b=["a", "b"])))
13 |
14 | def processor1(ctx: RayExecutionEngine, df: rd.Dataset) -> pd.DataFrame:
15 | assert isinstance(ctx, RayExecutionEngine)
16 | return df.to_pandas()
17 |
18 | def processor2(df: pd.DataFrame) -> rd.Dataset:
19 | return rd.from_pandas(df)
20 |
21 | def outputter(df: rd.Dataset) -> None:
22 | assert [[1, "a"], [2, "b"]] == df.to_pandas().values.tolist()
23 |
24 | dag = FugueWorkflow()
25 | dag.create(creator).process(processor1).process(processor2).output(outputter)
26 |
27 | dag.run("ray")
28 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/fugue_spark/_utils/misc.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | try:
4 | from pyspark.sql.connect.session import SparkSession as SparkConnectSession
5 | from pyspark.sql.connect.dataframe import DataFrame as SparkConnectDataFrame
6 | except Exception: # pragma: no cover
7 | SparkConnectSession = None
8 | SparkConnectDataFrame = None
9 | import pyspark.sql as ps
10 |
11 |
12 | def is_spark_connect(session: Any) -> bool:
13 | return SparkConnectSession is not None and isinstance(
14 | session, (SparkConnectSession, SparkConnectDataFrame)
15 | )
16 |
17 |
18 | def is_spark_dataframe(df: Any) -> bool:
19 | return isinstance(df, ps.DataFrame) or (
20 | SparkConnectDataFrame is not None and isinstance(df, SparkConnectDataFrame)
21 | )
22 |
23 |
24 | def is_spark_session(session: Any) -> bool:
25 | return isinstance(session, ps.SparkSession) or (
26 | SparkConnectSession is not None and isinstance(session, SparkConnectSession)
27 | )
28 |
--------------------------------------------------------------------------------
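A hedged usage sketch: these helpers let callers detect Spark Connect sessions and dataframes without importing pyspark.sql.connect themselves:

from pyspark.sql import SparkSession

from fugue_spark._utils.misc import is_spark_connect, is_spark_session

spark = SparkSession.builder.master("local[1]").getOrCreate()
assert is_spark_session(spark)      # true for classic and Connect sessions
assert not is_spark_connect(spark)  # a local classic session is not Connect

--------------------------------------------------------------------------------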
/fugue/_utils/misc.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Type, TypeVar
2 |
3 | from triad.utils.assertion import assert_or_throw
4 |
5 | T = TypeVar("T")
6 |
7 |
8 | def get_attribute(obj: object, attr_name: str, data_type: Type[T]) -> T:
9 | if attr_name not in obj.__dict__ or obj.__dict__[attr_name] is None:
10 | obj.__dict__[attr_name] = data_type()
11 | assert_or_throw(
12 | isinstance(obj.__dict__[attr_name], data_type),
13 | lambda: TypeError(f"{obj.__dict__[attr_name]} is not type {data_type}"),
14 | )
15 | return obj.__dict__[attr_name]
16 |
17 |
18 | def import_or_throw(package_name: str, message: str) -> Any:
19 | try:
20 | return __import__(package_name)
21 | except Exception as e: # pragma: no cover
22 | raise ImportError(str(e) + ". " + message)
23 |
24 |
25 | def import_fsql_dependency(package_name: str) -> Any:
26 | return import_or_throw(
27 | package_name, "Please try to install the package with `pip install fugue[sql]`."
28 | )
29 |
--------------------------------------------------------------------------------
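A short sketch of get_attribute's lazy-init-plus-type-check contract, mirroring tests/fugue/utils/test_misc.py earlier in this dump:

from fugue._utils.misc import get_attribute


class Holder:
    pass


h = Holder()
items = get_attribute(h, "items", list)  # created as list() on first access
assert items == [] and h.items is items
h.items.append(1)
assert get_attribute(h, "items", list) == [1]  # same attribute is returned
# get_attribute(h, "items", dict) would raise TypeError: wrong declared type

--------------------------------------------------------------------------------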
/fugue/registry.py:
--------------------------------------------------------------------------------
1 | from fugue.execution.factory import register_execution_engine, register_sql_engine
2 | from fugue.execution.native_execution_engine import (
3 | NativeExecutionEngine,
4 | QPDPandasEngine,
5 | )
6 |
7 |
8 | def _register() -> None:
9 | """Register Fugue core additional types
10 |
11 | .. note::
12 |
13 | This function is automatically called when you do
14 |
15 | >>> import fugue
16 | """
17 | _register_engines()
18 |
19 |
20 | def _register_engines() -> None:
21 | register_execution_engine(
22 | "native", lambda conf: NativeExecutionEngine(conf), on_dup="ignore"
23 | )
24 | register_execution_engine(
25 | "pandas", lambda conf: NativeExecutionEngine(conf), on_dup="ignore"
26 | )
27 | register_sql_engine(
28 | "qpdpandas", lambda engine: QPDPandasEngine(engine), on_dup="ignore"
29 | )
30 | register_sql_engine(
31 | "qpd_pandas", lambda engine: QPDPandasEngine(engine), on_dup="ignore"
32 | )
33 |
--------------------------------------------------------------------------------
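Because _register() runs on `import fugue`, the engine names above are usable immediately; a quick sketch:

from fugue.execution import NativeExecutionEngine, make_execution_engine

engine = make_execution_engine("pandas")  # registered above with on_dup="ignore"
assert isinstance(engine, NativeExecutionEngine)

--------------------------------------------------------------------------------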
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Publish
5 |
6 | on:
7 | release:
8 | types: [created]
9 |
10 | jobs:
11 | deploy:
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - uses: actions/checkout@v2
16 | - name: Set up Python
17 | uses: actions/setup-python@v1
18 | with:
19 | python-version: '3.10'
20 | - name: Install dependencies
21 | run: make devenv
22 | - name: Test
23 | if: "!github.event.release.prerelease"
24 | run: make test
25 | - name: Build and publish
26 | env:
27 | RELEASE_TAG: ${{ github.event.release.tag_name }}
28 | TWINE_USERNAME: __token__
29 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
30 | run: |
31 | make package
32 | twine upload dist/*
33 |
--------------------------------------------------------------------------------
/.github/workflows/test_notebook.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Test Notebook Experience
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | paths-ignore:
10 | - 'docs/**'
11 | - '**.md'
12 | pull_request:
13 | branches: [ master ]
14 | paths-ignore:
15 | - 'docs/**'
16 | - '**.md'
17 |
18 | concurrency:
19 | group: ${{ github.workflow }}-${{ github.ref }}
20 | cancel-in-progress: true
21 |
22 | jobs:
23 | build:
24 | runs-on: ubuntu-latest
25 | strategy:
26 | matrix:
27 | python-version: ["3.10"]
28 |
29 | steps:
30 | - uses: actions/checkout@v2
31 | - name: Set up Python ${{ matrix.python-version }}
32 | uses: actions/setup-python@v1
33 | with:
34 | python-version: ${{ matrix.python-version }}
35 | - name: Install dependencies
36 | run: make devenv
37 | - name: Test
38 | run: make testnotebook
39 |
--------------------------------------------------------------------------------
/.gitpod.yml:
--------------------------------------------------------------------------------
1 | image: fugueproject/gitpod:0.7.2
2 |
3 | tasks:
4 | - init: |
5 | make devenv
6 |
7 | github:
8 | prebuilds:
9 | # enable for the master/default branch (defaults to true)
10 | master: true
11 | # enable for all branches in this repo (defaults to false)
12 | branches: true
13 | # enable for pull requests coming from this repo (defaults to true)
14 | pullRequests: true
15 | # enable for pull requests coming from forks (defaults to false)
16 | pullRequestsFromForks: true
17 | # add a "Review in Gitpod" button as a comment to pull requests (defaults to true)
18 | addComment: true
19 | # add a "Review in Gitpod" button to pull requests (defaults to false)
20 | addBadge: false
21 | # add a label once the prebuild is ready to pull requests (defaults to false)
22 | addLabel: prebuilt-in-gitpod
23 |
24 | vscode:
25 | extensions:
26 | - ms-python.python
27 | - njpwerner.autodocstring
28 | - ms-toolsai.jupyter
29 | - ms-toolsai.jupyter-keymap
30 | - ms-toolsai.jupyter-renderers
31 | - ms-python.isort
32 | - virgilsisoe.python-auto-import
33 |
--------------------------------------------------------------------------------
/docs/tutorials.rst:
--------------------------------------------------------------------------------
1 |
2 | Fugue Tutorials
3 | ================
4 |
5 | To directly read the tutorials without running them:
6 |
7 | .. toctree::
8 |
9 | Tutorial Homepage
10 | For Beginners
11 | For Advanced Users
12 | For Fugue-SQL
13 |
14 |
15 |
16 | You may launch a
17 | `Fugue tutorial notebook environment on binder `_
18 |
19 | **But it runs slowly on binder**: the machine on binder isn't powerful enough for
20 | a distributed framework such as Spark. Parallel executions can become sequential, so some of the
21 | performance comparison examples will not give you the correct numbers.
22 |
23 | Alternatively, you should get decent performance by running its docker image on your own machine:
24 |
25 | .. code-block:: bash
26 |
27 | docker run -p 8888:8888 fugueproject/tutorials:latest
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/tests/fugue/bag/test_array_bag.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from fugue import ArrayBag, Bag
4 | from fugue_test.bag_suite import BagTests
5 |
6 |
7 | class ArrayBagTests(BagTests.Tests):
8 | def bg(self, data: Any = None) -> Bag:
9 | return ArrayBag(data)
10 |
11 | def test_array_bag_init(self):
12 | def _it():
13 | yield from [1, 2, 3]
14 |
15 | bg = self.bg([])
16 | assert bg.count() == 0
17 | assert bg.is_local
18 | assert bg.is_bounded
19 | assert bg.as_local() is bg
20 | assert bg.empty
21 | assert bg.native == []
22 |
23 | for x in [[1, 2, 3], _it(), set([1, 2, 3])]:
24 | bg = self.bg(x)
25 | assert bg.count() == 3
26 | assert bg.is_local
27 | assert bg.is_bounded
28 | assert bg.as_local() is bg
29 | assert not bg.empty
30 | assert 1 == bg.num_partitions
31 | assert isinstance(bg.native, list)
32 |
33 | bg = self.bg(x + 1 for x in [])
34 | assert bg.count() == 0
35 | bg = self.bg(x + 1 for x in [1, 2, 3])
36 | assert bg.count() == 3
37 |
--------------------------------------------------------------------------------
/fugue/plugins.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # pylint: disable-all
3 | from fugue.collections.sql import transpile_sql
4 | from fugue.dataframe import (
5 | alter_columns,
6 | as_array,
7 | as_array_iterable,
8 | as_arrow,
9 | as_dict_iterable,
10 | as_dicts,
11 | as_pandas,
12 | drop_columns,
13 | fugue_annotated_param,
14 | get_column_names,
15 | get_schema,
16 | head,
17 | is_df,
18 | peek_array,
19 | peek_dict,
20 | rename,
21 | select_columns,
22 | )
23 | from fugue.dataset import (
24 | as_fugue_dataset,
25 | as_local,
26 | as_local_bounded,
27 | count,
28 | get_dataset_display,
29 | get_num_partitions,
30 | is_bounded,
31 | is_empty,
32 | is_local,
33 | )
34 | from fugue.execution.api import as_fugue_engine_df
35 | from fugue.execution.factory import (
36 | infer_execution_engine,
37 | parse_execution_engine,
38 | parse_sql_engine,
39 | )
40 | from fugue.extensions.creator import parse_creator
41 | from fugue.extensions.outputter import parse_outputter
42 | from fugue.extensions.processor import parse_processor
43 | from fugue.extensions.transformer import parse_output_transformer, parse_transformer
44 |
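Everything exported above is an extensible function: a backend adds support by registering a candidate with a matcher, the same pattern used in tests/fugue_ibis/mock/dataframe.py and fugue_ray/registry.py below. A minimal sketch, assuming a hypothetical MyTable container and an illustrative schema:

    from typing import Any

    from fugue.dataframe import ArrayDataFrame
    from fugue.plugins import as_fugue_dataset


    class MyTable:  # hypothetical container, for illustration only
        def __init__(self, rows):
            self.rows = rows


    # the lambda is the matcher: when it returns True, this candidate is chosen
    @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, MyTable))
    def _my_table_as_fugue(df: MyTable, **kwargs: Any) -> ArrayDataFrame:
        return ArrayDataFrame(df.rows, "a:int")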
--------------------------------------------------------------------------------
/tests/fugue_spark/test_importless.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | from pyspark.sql import DataFrame, SparkSession
4 |
5 | from fugue import FugueWorkflow, fsql, transform
6 | from fugue_spark._utils.convert import to_pandas
7 | from fugue_spark.registry import _is_sparksql
8 |
9 |
10 | def test_importless(spark_session):
11 | pytest.importorskip("fugue_sql_antlr")
12 |
13 | for engine in [spark_session, "spark"]:
14 | dag = FugueWorkflow()
15 | dag.df([[0]], "a:int").show()
16 |
17 | dag.run(engine)
18 |
19 | fsql(
20 | """
21 | CREATE [[0],[1]] SCHEMA a:int
22 | SELECT * WHERE a<1
23 | PRINT
24 | """
25 | ).run(engine)
26 |
27 |
28 | def test_is_sparksql():
29 | assert _is_sparksql(("sparksql", "abc"))
30 | assert not _is_sparksql(123)
31 | assert not _is_sparksql("SELECT *")
32 |
33 |
34 | def test_transform_from_sparksql(spark_session):
35 | # schema: *
36 | def t(df: pd.DataFrame) -> pd.DataFrame:
37 | return df
38 |
39 | res = transform(("sparksql", "SELECT 1 AS a, 'b' AS aa"), t)
40 | assert isinstance(res, DataFrame) # engine inference
41 | assert to_pandas(res).to_dict("records") == [{"a": 1, "aa": "b"}]
42 |
--------------------------------------------------------------------------------
/.github/workflows/test_win.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Test Windows
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | paths-ignore:
10 | - 'docs/**'
11 | - '**.md'
12 | pull_request:
13 | branches: [ master ]
14 | paths-ignore:
15 | - 'docs/**'
16 | - '**.md'
17 |
18 | concurrency:
19 | group: ${{ github.workflow }}-${{ github.ref }}
20 | cancel-in-progress: true
21 |
22 | jobs:
23 | build:
24 | runs-on: windows-latest
25 | strategy:
26 | matrix:
27 | python-version: ["3.10"]
28 | steps:
29 | - uses: actions/checkout@v2
30 | - name: Set up Python ${{ matrix.python-version }}
31 | uses: actions/setup-python@v1
32 | with:
33 | python-version: ${{ matrix.python-version }}
34 | - name: Install dependencies
35 | run: pip install -r requirements.txt
36 | # - name: Install pyarrow
37 | # run: pip install pyarrow==8.0.0
38 | - name: Test
39 | run: python -m pytest --reruns 2 --only-rerun 'Overflow in cast' tests/fugue tests/fugue_dask tests/fugue_duckdb
40 |
--------------------------------------------------------------------------------
/fugue/bag/array_bag.py:
--------------------------------------------------------------------------------
1 | from types import GeneratorType
2 | from typing import Any, Iterable, List
3 |
4 | from ..exceptions import FugueDatasetEmptyError
5 | from .bag import LocalBoundedBag
6 |
7 |
8 | class ArrayBag(LocalBoundedBag):
9 | def __init__(self, data: Any, copy: bool = True):
10 | if isinstance(data, list):
11 | self._native = list(data) if copy else data
12 | elif isinstance(data, (GeneratorType, Iterable)):
13 | self._native = list(data)
14 | else:
15 | raise ValueError(f"{type(data)} can't be converted to ArrayBag")
16 | super().__init__()
17 |
18 | @property
19 | def native(self) -> List[Any]:
20 | """The underlying Python list object"""
21 | return self._native
22 |
23 | @property
24 | def empty(self) -> bool:
25 | return len(self._native) == 0
26 |
27 | def count(self) -> int:
28 | return len(self._native)
29 |
30 | def peek(self) -> Any:
31 | if self.count() == 0:
32 | raise FugueDatasetEmptyError()
33 | return self._native[0]
34 |
35 | def as_array(self) -> List[Any]:
36 | return list(self._native)
37 |
38 | def head(self, n: int) -> LocalBoundedBag:
39 | return ArrayBag(self._native[:n])
40 |
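A quick usage sketch, consistent with tests/fugue/bag/test_array_bag.py above:

    from fugue import ArrayBag

    bag = ArrayBag([1, 2, 3])
    assert bag.count() == 3 and not bag.empty
    assert bag.peek() == 1  # first element; raises FugueDatasetEmptyError when empty
    assert bag.head(2).as_array() == [1, 2]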
--------------------------------------------------------------------------------
/tests/fugue_ibis/mock/dataframe.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from fugue import ArrowDataFrame, DataFrame, LocalDataFrame
4 | from fugue.plugins import as_fugue_dataset, as_local_bounded
5 | from fugue_ibis import IbisDataFrame, IbisTable
6 |
7 |
8 | class MockDuckDataFrame(IbisDataFrame):
9 | def to_sql(self) -> str:
10 | return str(self.native.compile())
11 |
12 | def _to_new_df(self, table: IbisTable, schema: Any = None) -> DataFrame:
13 | return MockDuckDataFrame(table, schema=schema)
14 |
15 | def _to_local_df(self, table: IbisTable, schema: Any = None) -> LocalDataFrame:
16 | return ArrowDataFrame(table.execute(), schema=schema)
17 |
18 | def _to_iterable_df(self, table: IbisTable, schema: Any = None) -> LocalDataFrame:
19 | return self._to_local_df(table, schema=schema)
20 |
21 |
22 | # should also check that df._findbackend is duckdb
23 | @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, IbisTable))
24 | def _ibis_as_fugue(df: IbisTable, **kwargs: Any) -> DataFrame:
25 | return MockDuckDataFrame(df, **kwargs)
26 |
27 |
28 | # should also check that df._findbackend is duckdb
29 | @as_local_bounded.candidate(lambda df, **kwargs: isinstance(df, IbisTable))
30 | def _ibis_as_local(df: IbisTable, **kwargs: Any) -> Any:
31 | return df.execute()
32 |
--------------------------------------------------------------------------------
/.github/workflows/test_core.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Core Tests
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | paths-ignore:
10 | - 'docs/**'
11 | - '**.md'
12 | pull_request:
13 | branches: [ master ]
14 | paths-ignore:
15 | - 'docs/**'
16 | - '**.md'
17 |
18 | concurrency:
19 | group: ${{ github.workflow }}-${{ github.ref }}
20 | cancel-in-progress: true
21 |
22 | jobs:
23 | core-tests:
24 | name: Tests
25 | runs-on: ubuntu-latest
26 | strategy:
27 | matrix:
28 | python-version: ["3.10", "3.11", "3.12"]
29 |
30 | steps:
31 | - uses: actions/checkout@v2
32 | - name: Set up Python ${{ matrix.python-version }}
33 | uses: actions/setup-python@v1
34 | with:
35 | python-version: ${{ matrix.python-version }}
36 | - name: Fix setuptools_scm
37 | run: pip install "setuptools_scm<7"
38 | - name: Install dependencies
39 | run: make devenv
40 | - name: Install pandas 2
41 | if: matrix.python-version == '3.10'
42 | run: pip install "pandas>=2"
43 | - name: Test
44 | run: make testcore
45 |
--------------------------------------------------------------------------------
/tests/fugue/rpc/test_func.py:
--------------------------------------------------------------------------------
1 | from fugue.rpc import RPCFunc, to_rpc_handler
2 | from pytest import raises
3 | from triad import to_uuid
4 | from copy import copy, deepcopy
5 |
6 |
7 | def test_rpc_func():
8 | def f1(a: str) -> str:
9 | return "1"
10 |
11 | d1 = RPCFunc(f1)
12 | d2 = to_rpc_handler(f1)
13 | assert to_uuid(d1) == to_uuid(d2)
14 | assert to_uuid(d1) == to_uuid(to_rpc_handler(d1))
15 | assert "1" == d1("x")
16 | with raises(ValueError):
17 | RPCFunc(1)
18 |
19 |
20 | def test_determinism():
21 | def _f1(a: str) -> str:
22 | return "1"
23 |
24 | assert to_uuid(RPCFunc(_f1)) == to_uuid(to_rpc_handler(_f1))
25 | assert to_uuid(RPCFunc(lambda x: x)) == to_uuid(RPCFunc(lambda x: x + 1))
26 |
27 |
28 | def test_no_copy():
29 | class T(object):
30 | def __init__(self):
31 | self.n = 0
32 |
33 | def call(self, n: int) -> int:
34 | self.n += n
35 | return self.n
36 |
37 | t = T()
38 | d1 = RPCFunc(t.call)
39 | assert 10 == d1(10)
40 | assert 10 == t.n
41 |
42 | d2 = to_rpc_handler(t.call)
43 | d2(10)
44 |
45 | d3 = to_rpc_handler(d1)
46 | d3(10)
47 | assert 30 == t.n
48 |
49 | d4 = copy(d3)
50 | d4(10)
51 |
52 | d5 = deepcopy(d4)
53 | d5(10)
54 | assert 50 == t.n
55 |
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Fugue Development Environment",
3 | "image": "mcr.microsoft.com/vscode/devcontainers/python:3.11-bookworm",
4 | "customizations": {
5 | "vscode": {
6 | "settings": {
7 | "terminal.integrated.shell.linux": "/bin/bash",
8 | "python.pythonPath": "/usr/local/bin/python",
9 | "python.defaultInterpreterPath": "/usr/local/bin/python",
10 | "editor.defaultFormatter": "ms-python.black-formatter",
11 | "isort.interpreter": [
12 | "/usr/local/bin/python"
13 | ],
14 | "flake8.interpreter": [
15 | "/usr/local/bin/python"
16 | ],
17 | "pylint.interpreter": [
18 | "/usr/local/bin/python"
19 | ],
20 | "black-formatter.interpreter": [
21 | "/usr/local/bin/python"
22 | ]
23 | },
24 | "extensions": [
25 | "ms-python.python",
26 | "ms-python.isort",
27 | "ms-python.flake8",
28 | "ms-python.pylint",
29 | "ms-python.mypy",
30 | "ms-python.black-formatter",
31 | "GitHub.copilot",
32 | "njpwerner.autodocstring"
33 | ]
34 | }
35 | },
36 | "forwardPorts": [
37 | 8888
38 | ],
39 | "postCreateCommand": "make devenv",
40 | "features": {
41 | "ghcr.io/devcontainers/features/docker-in-docker:2.12.4": {},
42 | "ghcr.io/devcontainers/features/java:1.6.3": {},
43 | "ghcr.io/devcontainers/features/node:1.6.3": {}
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. Fugue documentation master file, created by
2 | sphinx-quickstart on Sun May 17 21:49:44 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Fugue API Docs
7 | ==============
8 |
9 | Fugue is a unified interface for distributed computing that lets users execute Python,
10 | pandas, and SQL code on Spark, Dask, and Ray with minimal rewrites.
11 |
12 | This documentation page is mainly an API reference. To learn more about Fugue, the
13 | `Github repo README `_ and the
14 | `tutorials `_ will be the best places to start.
15 | The API reference is mainly for users looking for specific functions and methods.
16 |
17 | Installation
18 | ------------
19 |
20 | Fugue is available on both pip and conda. `Detailed instructions `_
21 | can be found on the README.
22 |
23 | Community
24 | ---------
25 |
26 | Please join the `Fugue Slack `_
27 | to ask questions. We will try to reply as soon as possible.
28 |
29 | For contributing, start with the `contributing guide `_.
30 |
31 |
32 | .. toctree::
33 | :maxdepth: 3
34 | :hidden:
35 |
36 | tutorials
37 | top_api
38 | api
39 |
40 |
--------------------------------------------------------------------------------
/fugue_contrib/viz/__init__.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any, Tuple
3 |
4 | import pandas as pd
5 |
6 | from fugue import Outputter
7 | from fugue.extensions import namespace_candidate, parse_outputter
8 |
9 | from ._ext import Visualize
10 |
11 |
12 | @parse_outputter.candidate(namespace_candidate("viz", lambda x: isinstance(x, str)))
13 | def _parse_pandas_plot(obj: Tuple[str, str]) -> Outputter:
14 | return _PandasVisualize(obj[1])
15 |
16 |
17 | class _PandasVisualize(Visualize):
18 | def __init__(self, func: str) -> None:
19 | super().__init__(func)
20 | if func != "plot":
21 | getattr(pd.DataFrame.plot, func) # ensure the func exists
22 |
23 | def _plot(self, df: pd.DataFrame) -> None:
24 | params = dict(self.params)
25 | if len(self.partition_spec.partition_by) > 0:
26 | keys = df[self.partition_spec.partition_by].head(1).to_dict("records")[0]
27 | kt = json.dumps(keys)[1:-1]
28 | if "title" in params:
29 | params["title"] = params["title"] + " -- " + kt
30 | else:
31 | params["title"] = kt
32 | df = df.drop(self.partition_spec.partition_by, axis=1)
33 | func = self._get_func(df)
34 | func(**params)
35 |
36 | def _get_func(self, df: pd.DataFrame) -> Any:
37 | if self._func == "plot":
38 | return df.plot
39 | return getattr(df.plot, self._func)
40 |
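A usage sketch of the outputter registered above (assuming fugue_contrib.viz is the import path, and that .output forwards params the way .process does in the workflow tests):

    import pandas as pd

    import fugue_contrib.viz  # noqa: F401  registers the ("viz", ...) namespace
    from fugue import FugueWorkflow

    df = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
    dag = FugueWorkflow()
    # dispatched to df.plot.line(x="x", y="y") on the pandas side
    dag.df(df).output(("viz", "line"), params=dict(x="x", y="y"))
    dag.run()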
--------------------------------------------------------------------------------
/.github/workflows/test_no_sql.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Tests Excluding SQL Dependencies
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | paths-ignore:
10 | - 'docs/**'
11 | - '**.md'
12 | pull_request:
13 | branches: [ master ]
14 | paths-ignore:
15 | - 'docs/**'
16 | - '**.md'
17 |
18 | concurrency:
19 | group: ${{ github.workflow }}-${{ github.ref }}
20 | cancel-in-progress: true
21 |
22 | jobs:
23 | tests-no-sql:
24 | name: Tests
25 | runs-on: ubuntu-latest
26 | strategy:
27 | matrix:
28 | python-version: ["3.10"]
29 |
30 | steps:
31 | - uses: actions/checkout@v2
32 | - name: Set up Python ${{ matrix.python-version }}
33 | uses: actions/setup-python@v1
34 | with:
35 | python-version: ${{ matrix.python-version }}
36 | - name: Fix setuptools_scm
37 | run: pip install "setuptools_scm<7"
38 | - name: Install dependencies
39 | run: make devenv
40 | - name: Install pandas 2
41 | if: matrix.python-version == '3.10'
42 | run: pip install "pandas>=2"
43 | - name: Remove SQL dependencies
44 | run: pip uninstall -y qpd fugue-sql-antlr sqlglot
45 | - name: Test
46 | run: make testnosql
47 |
--------------------------------------------------------------------------------
/.github/workflows/test_all.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Full Tests
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | paths-ignore:
10 | - 'docs/**'
11 | - '**.md'
12 | pull_request:
13 | branches: [ master ]
14 | paths-ignore:
15 | - 'docs/**'
16 | - '**.md'
17 |
18 | concurrency:
19 | group: ${{ github.workflow }}-${{ github.ref }}
20 | cancel-in-progress: true
21 |
22 | jobs:
23 | all:
24 | name: Tests & Lint
25 | runs-on: ubuntu-latest
26 | strategy:
27 | matrix:
28 | python-version: ["3.10", "3.11", "3.12"]
29 |
30 | steps:
31 | - uses: actions/checkout@v2
32 | - name: Set up Python ${{ matrix.python-version }}
33 | uses: actions/setup-python@v1
34 | with:
35 | python-version: ${{ matrix.python-version }}
36 | - name: Install dependencies
37 | run: make devenv
38 | - name: Lint
39 | if: matrix.python-version == '3.10'
40 | run: make lint
41 | - name: Test
42 | run: make test
43 | - name: "Upload coverage to Codecov"
44 | if: matrix.python-version == '3.10'
45 | uses: codecov/codecov-action@v4
46 | with:
47 | fail_ci_if_error: false
48 | token: ${{ secrets.CODECOV_TOKEN }}
49 |
--------------------------------------------------------------------------------
/tests/fugue_spark/test_spark_connect.py:
--------------------------------------------------------------------------------
1 | import fugue.test as ft
2 |
3 | from .test_dataframe import NativeSparkDataFrameTestsBase as _NativeDataFrameTests
4 | from .test_dataframe import SparkDataFrameTestsBase as _DataFrameTests
5 | from .test_execution_engine import _CONF
6 | from .test_execution_engine import (
7 | SparkExecutionEngineBuiltInTestsBase as _WorkflowTests,
8 | )
9 | from .test_execution_engine import (
10 | SparkExecutionEnginePandasUDFTestsBase as _EngineTests,
11 | )
12 |
13 |
14 | @ft.fugue_test_suite("sparkconnect", mark_test=True)
15 | class SparkConnectDataFrameTests(_DataFrameTests):
16 | pass
17 |
18 |
19 | @ft.fugue_test_suite("sparkconnect", mark_test=True)
20 | class SparkConnectNativeDataFrameTests(_NativeDataFrameTests):
21 | pass
22 |
23 |
24 | @ft.fugue_test_suite("sparkconnect", mark_test=True)
25 | class SparkConnectExecutionEngineTests(_EngineTests):
26 | def test_using_pandas_udf(self):
27 | return
28 |
29 | def test_map_with_dict_col(self):
30 | return # spark connect has a bug
31 |
32 |
33 | @ft.fugue_test_suite(("sparkconnect", _CONF), mark_test=True)
34 | class SparkConnectBuiltInTests(_WorkflowTests):
35 | def test_annotation_3(self):
36 | return # RDD is not implemented in spark connect
37 |
38 | def test_repartition(self):
39 | return # spark connect doesn't support even repartitioning
40 |
41 | def test_repartition_large(self):
42 | return # spark connect doesn't support even repartitioning
43 |
--------------------------------------------------------------------------------
/fugue_contrib/viz/_ext.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Any
3 |
4 | import pandas as pd
5 | from triad import assert_or_throw
6 |
7 | from fugue import DataFrames, Outputter
8 | from fugue.exceptions import FugueWorkflowError
9 |
10 |
11 | class Visualize(Outputter, ABC):
12 | def __init__(self, func: str) -> None:
13 | super().__init__()
14 | self._func = func
15 |
16 | def process(self, dfs: DataFrames) -> None:
17 | assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
18 | df = dfs[0].as_pandas()
19 | presort = self.partition_spec.presort
20 | presort_keys = list(presort.keys())
21 | presort_asc = list(presort.values())
22 | if len(presort_keys) > 0:
23 | df = df.sort_values(presort_keys, ascending=presort_asc).reset_index(
24 | drop=True
25 | )
26 | if len(self.partition_spec.partition_by) == 0:
27 | self._plot(df)
28 | else:
29 | keys: Any = ( # avoid pandas warning
30 | self.partition_spec.partition_by
31 | if len(self.partition_spec.partition_by) > 1
32 | else self.partition_spec.partition_by[0]
33 | )
34 | for _, gp in df.groupby(keys, dropna=False):
35 | self._plot(gp.reset_index(drop=True))
36 |
37 | @abstractmethod
38 | def _plot(self, df: pd.DataFrame) -> None: # pragma: no cover
39 | raise NotImplementedError
40 |
--------------------------------------------------------------------------------
/.github/workflows/test_ray.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Ray Tests
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | paths-ignore:
10 | - 'docs/**'
11 | - '**.md'
12 | pull_request:
13 | branches: [ master ]
14 | paths-ignore:
15 | - 'docs/**'
16 | - '**.md'
17 |
18 | concurrency:
19 | group: ${{ github.workflow }}-${{ github.ref }}
20 | cancel-in-progress: true
21 |
22 | jobs:
23 | test_ray_lower_bound:
24 | name: Ray 2.30.0
25 | runs-on: ubuntu-latest
26 |
27 | steps:
28 | - uses: actions/checkout@v2
29 | - name: Set up Python 3.10
30 | uses: actions/setup-python@v1
31 | with:
32 | python-version: "3.10"
33 | - name: Install dependencies
34 | run: make devenv
35 | - name: Setup Ray
36 | run: pip install ray[data]==2.30.0
37 | - name: Test
38 | run: make testray
39 |
40 | test_ray_latest:
41 | name: Ray Latest
42 | runs-on: ubuntu-latest
43 |
44 | steps:
45 | - uses: actions/checkout@v2
46 | - name: Set up Python 3.10
47 | uses: actions/setup-python@v1
48 | with:
49 | python-version: "3.10"
50 | - name: Install dependencies
51 | run: make devenv
52 | - name: Setup Ray
53 | run: pip install -U ray[data]
54 | - name: Test
55 | run: make testray
56 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | license_files = LICENSE
4 |
5 | [testenv]
6 | setenv =
7 | COV_CORE_SOURCE=
8 | COV_CORE_CONFIG=.coveragerc
9 | COV_CORE_DATAFILE=.coverage
10 |
11 | [tool:pytest]
12 | addopts =
13 | -p pytest_cov
14 | --cov=fugue
15 | --cov=fugue_test
16 | --cov=fugue_spark
17 | --cov=fugue_dask
18 | --cov=fugue_ray
19 | --cov=fugue_duckdb
20 | --cov=fugue_ibis
21 | --cov=fugue_polars
22 | --ignore=tests/fugue_spark/test_spark_connect.py
23 | --cov-report=term-missing:skip-covered
24 | -vvv
25 | spark_options =
26 | spark.master: local[*]
27 | spark.sql.catalogImplementation: in-memory
28 | spark.sql.shuffle.partitions: 4
29 | spark.default.parallelism: 4
30 | spark.executor.cores: 4
31 | spark.sql.execution.arrow.pyspark.enabled: true
32 | spark.sql.adaptive.enabled: false
33 | fugue_test_conf =
34 | # don't move for testing purpose
35 | fugue.test.dummy=dummy
36 | fugue.test:bool=true
37 | # ray settings
38 | ray.num_cpus:int=2
39 | # dask settings
40 | dask.processes:bool=true
41 | dask.n_workers:int=3
42 | dask.threads_per_worker:int=1
43 |
44 |
45 |
46 | [coverage:run]
47 | omit =
48 | fugue_sql/_antlr/*
49 | fugue_test/plugins/*
50 | fugue_test/fixtures.py
51 | fugue_test/__init__.py
52 |
53 | [flake8]
54 | ignore = E24,E203,W503,C401,C408,C420,A001,A003,A005,W504,C407,C405,B023,B028
55 | max-line-length = 88
56 | format = pylint
57 | exclude = .svc,CVS,.bzr,.hg,.git,__pycache__,venv,tests/*,docs/*
58 | max-complexity = 10
59 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 | python: python3
3 |
4 | exclude: |
5 | (?x)(
6 | ^tests/|
7 | ^docs/|
8 | ^fugue_sql/_antlr/
9 | )
10 | repos:
11 | - repo: https://github.com/pre-commit/pre-commit-hooks
12 | rev: v3.2.0
13 | hooks:
14 | - id: check-ast
15 | - id: check-docstring-first
16 | - id: check-executables-have-shebangs
17 | - id: check-json
18 | - id: check-merge-conflict
19 | - id: check-yaml
20 | - id: debug-statements
21 | - id: end-of-file-fixer
22 | - id: trailing-whitespace
23 | - id: check-vcs-permalinks
24 | - repo: https://github.com/pycqa/flake8
25 | rev: '3.8.3'
26 | hooks:
27 | - id: flake8
28 | types: [python]
29 | additional_dependencies:
30 | - flake8-bugbear
31 | - flake8-builtins
32 | # - flake8-docstrings # TODO: add back!
33 | # - flake8-rst-docstrings
34 | - flake8-comprehensions
35 | - flake8-tidy-imports
36 | - pycodestyle
37 | - repo: https://github.com/pre-commit/mirrors-mypy
38 | rev: v0.971
39 | hooks:
40 | - id: mypy
41 | - repo: https://github.com/PyCQA/pylint
42 | rev: v3.2.6
43 | hooks:
44 | - id: pylint
45 | - repo: https://github.com/ambv/black
46 | rev: 22.3.0
47 | hooks:
48 | - id: black
49 | types: [python]
50 | language_version: python3
51 |
--------------------------------------------------------------------------------
/fugue/dev.py:
--------------------------------------------------------------------------------
1 | """
2 | All modules for developing and extending Fugue
3 | """
4 | # flake8: noqa
5 | # pylint: disable-all
6 |
7 | from triad.collections.function_wrapper import AnnotatedParam
8 |
9 | from fugue.bag.bag import BagDisplay
10 | from fugue.collections.partition import PartitionCursor, PartitionSpec
11 | from fugue.collections.sql import StructuredRawSQL, TempTableName
12 | from fugue.collections.yielded import PhysicalYielded, Yielded
13 | from fugue.dataframe.function_wrapper import (
14 | DataFrameFunctionWrapper,
15 | DataFrameParam,
16 | LocalDataFrameParam,
17 | fugue_annotated_param,
18 | )
19 | from fugue.dataset import DatasetDisplay
20 | from fugue.execution.execution_engine import (
21 | EngineFacet,
22 | ExecutionEngineParam,
23 | MapEngine,
24 | SQLEngine,
25 | )
26 | from fugue.execution.factory import (
27 | is_pandas_or,
28 | make_execution_engine,
29 | make_sql_engine,
30 | register_default_execution_engine,
31 | register_default_sql_engine,
32 | register_execution_engine,
33 | register_sql_engine,
34 | )
35 | from fugue.execution.native_execution_engine import PandasMapEngine, QPDPandasEngine
36 | from fugue.rpc import (
37 | EmptyRPCHandler,
38 | RPCClient,
39 | RPCFunc,
40 | RPCHandler,
41 | RPCServer,
42 | make_rpc_server,
43 | to_rpc_handler,
44 | )
45 | from fugue.workflow._workflow_context import FugueWorkflowContext
46 | from fugue.workflow.module import module
47 | from fugue.workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames
48 |
--------------------------------------------------------------------------------
/fugue/api.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # pylint: disable-all
3 | from .dataframe.api import (
4 | alter_columns,
5 | as_array,
6 | as_array_iterable,
7 | as_arrow,
8 | as_dict_iterable,
9 | as_dicts,
10 | as_fugue_df,
11 | as_pandas,
12 | drop_columns,
13 | get_column_names,
14 | get_native_as_df,
15 | get_schema,
16 | head,
17 | is_df,
18 | normalize_column_names,
19 | peek_array,
20 | peek_dict,
21 | rename,
22 | select_columns,
23 | )
24 | from .dataset.api import (
25 | as_fugue_dataset,
26 | as_local,
27 | as_local_bounded,
28 | count,
29 | get_num_partitions,
30 | is_bounded,
31 | is_empty,
32 | is_local,
33 | show,
34 | )
35 | from .execution.api import (
36 | aggregate,
37 | anti_join,
38 | as_fugue_engine_df,
39 | assign,
40 | broadcast,
41 | clear_global_engine,
42 | cross_join,
43 | distinct,
44 | dropna,
45 | engine_context,
46 | fillna,
47 | filter,
48 | full_outer_join,
49 | get_context_engine,
50 | get_current_conf,
51 | get_current_parallelism,
52 | inner_join,
53 | intersect,
54 | join,
55 | left_outer_join,
56 | load,
57 | persist,
58 | repartition,
59 | right_outer_join,
60 | run_engine_function,
61 | sample,
62 | save,
63 | select,
64 | semi_join,
65 | set_global_engine,
66 | subtract,
67 | take,
68 | union,
69 | )
70 | from .sql.api import fugue_sql, fugue_sql_flow
71 | from .workflow.api import out_transform, raw_sql, transform
72 |
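A minimal sketch of this API surface on a plain pandas DataFrame (the native engine is the assumed default when no engine is given):

    import pandas as pd

    import fugue.api as fa

    df = pd.DataFrame({"a": [1, 2, 3]})
    assert fa.count(df) == 3 and fa.is_local(df)
    fa.show(df)  # engine-agnostic display

    def add_b(d: pd.DataFrame) -> pd.DataFrame:
        return d.assign(b=1)

    res = fa.transform(df, add_b, schema="*,b:int")  # a pandas DataFrame here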
--------------------------------------------------------------------------------
/fugue_contrib/seaborn/__init__.py:
--------------------------------------------------------------------------------
1 | import json
2 | from functools import partial
3 | from typing import Any, Tuple
4 |
5 | import matplotlib.pyplot as plt
6 | import pandas as pd
7 | import seaborn
8 |
9 | from fugue import Outputter
10 | from fugue.extensions import namespace_candidate, parse_outputter
11 |
12 | from ..viz._ext import Visualize
13 |
14 |
15 | @parse_outputter.candidate(namespace_candidate("sns", lambda x: isinstance(x, str)))
16 | def _parse_seaborn(obj: Tuple[str, str]) -> Outputter:
17 | return _SeabornVisualize(obj[1])
18 |
19 |
20 | class _SeabornVisualize(Visualize):
21 | def __init__(self, func: str) -> None:
22 | super().__init__(func)
23 | getattr(seaborn, func) # ensure the func exists
24 |
25 | def _plot(self, df: pd.DataFrame) -> None:
26 | params = dict(self.params)
27 | title: Any = None
28 | if len(self.partition_spec.partition_by) > 0:
29 | keys = df[self.partition_spec.partition_by].head(1).to_dict("records")[0]
30 | kt = json.dumps(keys)[1:-1]
31 | if "title" in params:
32 | params["title"] = params["title"] + " -- " + kt
33 | else:
34 | params["title"] = kt
35 | df = df.drop(self.partition_spec.partition_by, axis=1)
36 | func = self._get_func(df)
37 | title = params.pop("title", None)
38 | plt.figure(0)
39 | func(**params).set(title=title)
40 | plt.show()
41 |
42 | def _get_func(self, df: pd.DataFrame) -> Any:
43 | f = getattr(seaborn, self._func)
44 | return partial(f, df)
45 |
--------------------------------------------------------------------------------
/tests/fugue/rpc/test_flask.py:
--------------------------------------------------------------------------------
1 | import cloudpickle
2 | from triad import ParamDict
3 | from fugue.rpc import make_rpc_server
4 | import pytest
5 |
6 | pytest.importorskip("flask")
7 | pytest.importorskip("jinja2")
8 |
9 |
10 | def test_flask_service():
11 | # fugue.rpc.flask.FlaskRPCServer
12 | conf = ParamDict(
13 | {
14 | "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
15 | "fugue.rpc.flask_server.host": "127.0.0.1",
16 | "fugue.rpc.flask_server.port": "1234",
17 | "fugue.rpc.flask_server.timeout": "2 sec",
18 | }
19 | )
20 |
21 | def k(value: str) -> str:
22 | return value + "x"
23 |
24 | def kk(a: int, b: int) -> int:
25 | return a + b
26 |
27 | def kkk(f, a: int) -> int:
28 | return f(a)
29 |
30 | with make_rpc_server(conf).start() as server:
31 | assert "1234" == server.conf["fugue.rpc.flask_server.port"]
32 | with server.start():  # a nested start takes no effect
33 | client1 = cloudpickle.loads(cloudpickle.dumps(server.make_client(k)))
34 | assert "dddx" == client1("ddd")
35 | client2 = cloudpickle.loads(cloudpickle.dumps(server.make_client(kk)))
36 | assert 3 == client2(1, 2)
37 | assert "dddx" == client1("ddd")
38 | # function serialization has been disabled
39 | # client3 = cloudpickle.loads(cloudpickle.dumps(server.make_client(kkk)))
40 | # assert 3 == client3(lambda x: x + 1, 2)
41 | # assert 3 == client2(1, 2)
42 | server.stop()  # an extra stop at the end takes no effect
43 |
--------------------------------------------------------------------------------
/fugue_test/fixtures.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | _DEFAULT_SCOPE = "module"
4 |
5 |
6 | @pytest.fixture(scope=_DEFAULT_SCOPE)
7 | def pandas_session():
8 | yield "pandas"
9 |
10 |
11 | @pytest.fixture(scope=_DEFAULT_SCOPE)
12 | def native_session():
13 | yield "native"
14 |
15 |
16 | @pytest.fixture(scope=_DEFAULT_SCOPE)
17 | def dask_session():
18 | from fugue_dask.tester import DaskTestBackend
19 |
20 | with DaskTestBackend.generate_session_fixture() as session:
21 | yield session
22 |
23 |
24 | @pytest.fixture(scope=_DEFAULT_SCOPE)
25 | def duckdb_session():
26 | from fugue_duckdb.tester import DuckDBTestBackend
27 |
28 | with DuckDBTestBackend.generate_session_fixture() as session:
29 | yield session
30 |
31 |
32 | @pytest.fixture(scope=_DEFAULT_SCOPE)
33 | def duckdask_session():
34 | from fugue_duckdb.tester import DuckDaskTestBackend
35 |
36 | with DuckDaskTestBackend.generate_session_fixture() as session:
37 | yield session
38 |
39 |
40 | @pytest.fixture(scope=_DEFAULT_SCOPE)
41 | def ray_session():
42 | from fugue_ray.tester import RayTestBackend
43 |
44 | with RayTestBackend.generate_session_fixture() as session:
45 | yield session
46 |
47 |
48 | @pytest.fixture(scope=_DEFAULT_SCOPE)
49 | def spark_session():
50 | from fugue_spark.tester import SparkTestBackend
51 |
52 | with SparkTestBackend.generate_session_fixture() as session:
53 | yield session
54 |
55 |
56 | @pytest.fixture(scope=_DEFAULT_SCOPE)
57 | def sparkconnect_session():
58 | from fugue_spark.tester import SparkConnectTestBackend
59 |
60 | with SparkConnectTestBackend.generate_session_fixture() as session:
61 | yield session
62 |
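A test consumes one of these fixtures by declaring it as an argument and passing the yielded session to the workflow, as tests/fugue_spark/test_importless.py does; a minimal sketch:

    from fugue import FugueWorkflow

    def test_show_on_spark(spark_session):
        dag = FugueWorkflow()
        dag.df([[0]], "a:int").show()
        dag.run(spark_session)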
--------------------------------------------------------------------------------
/fugue/extensions/creator/creator.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | from fugue.dataframe import DataFrame
4 | from fugue.extensions.context import ExtensionContext
5 |
6 |
7 | class Creator(ExtensionContext, ABC):
8 | """The interface is to generate single DataFrame from `params`.
9 | For example reading data from file should be a type of Creator.
10 | Creator is task level extension, running on driver, and execution engine aware.
11 |
12 | To implement this class, you should not define ``__init__``; directly implement
13 | the interface functions.
14 |
15 | .. note::
16 |
17 | Before implementing this class, consider whether you really need this
18 | interface: with Fugue's interfaceless feature, implementing Creator
19 | is commonly unnecessary, and the interfaceless approach may also
20 | decouple your code from Fugue.
21 |
22 | .. seealso::
23 |
24 | Please read :doc:`Creator Tutorial `
25 | """
26 |
27 | @abstractmethod
28 | def create(self) -> DataFrame: # pragma: no cover
29 | """Create DataFrame on driver side
30 |
31 | .. note::
32 |
33 | * It runs on driver side
34 | * The output dataframe is not necessarily local, for example a SparkDataFrame
35 | * It is engine aware: you can put platform-dependent code in it (for example
36 | native pyspark code), but doing so may make your code less portable. If you
37 | only use the functions of the general ExecutionEngine interface, it remains
38 | portable.
39 |
40 | :return: result dataframe
41 | """
42 | raise NotImplementedError
43 |
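A minimal sketch of both styles; the interfaceless variant mirrors the # schema: functions in tests/fugue/workflow/test_workflow_parallel.py:

    from typing import Any, List

    from fugue import FugueWorkflow
    from fugue.dataframe import ArrayDataFrame, DataFrame
    from fugue.extensions.creator import Creator


    class HelloCreator(Creator):
        def create(self) -> DataFrame:
            # runs on the driver; self.params and self.execution_engine are available
            return ArrayDataFrame([[0], [1]], "a:int")


    # schema: a:int
    def hello() -> List[List[Any]]:  # the interfaceless equivalent
        return [[0], [1]]


    dag = FugueWorkflow()
    dag.create(HelloCreator).show()
    dag.create(hello).show()
    dag.run()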
--------------------------------------------------------------------------------
/tests/fugue/workflow/test_workflow_parallel.py:
--------------------------------------------------------------------------------
1 | from fugue import FugueWorkflow, DataFrame, NativeExecutionEngine
2 | from typing import List, Any
3 | from time import sleep
4 | from timeit import timeit
5 | from pytest import raises
6 |
7 |
8 | def test_parallel():
9 | dag = FugueWorkflow({"fugue.workflow.concurrency": 10})
10 | dag.create(create).process(process).output(display)
11 | dag.create(create).process(process).output(display)
12 |
13 | t = timeit(
14 | lambda: dag.run(),
15 | number=1,
16 | ) # warmup
17 | t = timeit(
18 | lambda: dag.run(),
19 | number=1,
20 | )
21 | assert t < 0.4
22 |
23 |
24 | def test_parallel_exception():
25 | dag = FugueWorkflow({"fugue.workflow.concurrency": 2})
26 | dag.create(create).process(process).process(process, params=dict(sec=0.5)).output(
27 | display
28 | )
29 | dag.create(create_e).process(process).output(display)
30 |
31 | def run(dag, *args):
32 | with raises(NotImplementedError):
33 | dag.run(*args)
34 |
35 | t = timeit(
36 | lambda: run(dag),
37 | number=1,
38 | ) # warmup
39 | t = timeit(
40 | lambda: run(dag),
41 | number=1,
42 | )
43 | assert t < 0.5
44 |
45 |
46 | # schema: a:int
47 | def create(sec: float = 0.1) -> List[List[Any]]:
48 | sleep(sec)
49 | return [[0]]
50 |
51 |
52 | # schema: a:int
53 | def create_e(sec: float = 0.1) -> List[List[Any]]:
54 | raise NotImplementedError
55 |
56 |
57 | def process(df: DataFrame, sec: float = 0.1) -> DataFrame:
58 | sleep(sec)
59 | return df
60 |
61 |
62 | def display(df: DataFrame, sec: float = 0.1) -> None:
63 | sleep(sec)
64 | df.show()
65 |
--------------------------------------------------------------------------------
/fugue_duckdb/tester.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from typing import Any, Dict, Iterator
3 |
4 | import duckdb
5 |
6 | import fugue.test as ft
7 |
8 | try:
9 | import dask.distributed as dd
10 | import dask
11 |
12 | _HAS_DASK = True
13 | except ImportError: # pragma: no cover
14 | _HAS_DASK = False
15 |
16 |
17 | @ft.fugue_test_backend
18 | class DuckDBTestBackend(ft.FugueTestBackend):
19 | name = "duckdb"
20 |
21 | @classmethod
22 | @contextmanager
23 | def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
24 | with duckdb.connect(config=session_conf) as conn:
25 | yield conn
26 |
27 |
28 | if _HAS_DASK:
29 |
30 | @ft.fugue_test_backend
31 | class DuckDaskTestBackend(ft.FugueTestBackend):
32 | name = "duckdask"
33 |
34 | @classmethod
35 | def transform_session_conf(cls, conf: Dict[str, Any]) -> Dict[str, Any]:
36 | res = ft.extract_conf(conf, "duck.", remove_prefix=False)
37 | res.update(ft.extract_conf(conf, "dask.", remove_prefix=False))
38 | return res
39 |
40 | @classmethod
41 | @contextmanager
42 | def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
43 | duck_conf = ft.extract_conf(session_conf, "duck.", remove_prefix=True)
44 | dask_conf = ft.extract_conf(session_conf, "dask.", remove_prefix=True)
45 | with dd.Client(**dask_conf) as client:
46 | dask.config.set({"dataframe.shuffle.method": "tasks"})
47 | dask.config.set({"dataframe.convert-string": False})
48 | with duckdb.connect(config=duck_conf) as conn:
49 | yield [conn, client]
50 |
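These backends are resolved by name through fugue.test; a consuming sketch that mirrors tests/fugue_ibis/test_execution_engine.py:

    import fugue.test as ft
    from fugue_test.execution_suite import ExecutionEngineTests


    @ft.fugue_test_suite("duckdb", mark_test=True)
    class DuckDBEngineTests(ExecutionEngineTests.Tests):
        pass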
--------------------------------------------------------------------------------
/tests/fugue/utils/test_interfaceless.py:
--------------------------------------------------------------------------------
1 | from pytest import raises
2 |
3 | from fugue._utils.interfaceless import (
4 | is_class_method,
5 | parse_comment_annotation,
6 | parse_output_schema_from_comment,
7 | )
8 |
9 |
10 | def test_parse_comment_annotation():
11 | def a():
12 | pass
13 |
14 | # asdfasdf
15 | def b():
16 | pass
17 |
18 | # asdfasdf
19 | # schema : s:int
20 | # # # schema : a : int,b:str
21 | # schema : a : str ,b:str
22 | # asdfasdf
23 | def c():
24 | pass
25 |
26 | # schema:
27 | def d():
28 | pass
29 |
30 | assert parse_comment_annotation(a, "schema") is None
31 | assert parse_comment_annotation(b, "schema") is None
32 | assert "a : str ,b:str" == parse_comment_annotation(c, "schema")
33 | assert "" == parse_comment_annotation(d, "schema")
34 |
35 |
36 | def test_parse_output_schema_from_comment():
37 | def a():
38 | pass
39 |
40 | # asdfasdf
41 | def b():
42 | pass
43 |
44 | # asdfasdf
45 | # schema : s : int # more comment
46 | # # # schema : a : int,b:str
47 | # asdfasdf
48 | def c():
49 | pass
50 |
51 | # schema:
52 | def d():
53 | pass
54 |
55 | assert parse_output_schema_from_comment(a) is None
56 | assert parse_output_schema_from_comment(b) is None
57 | assert "s:int" == parse_output_schema_from_comment(c).replace(" ", "")
58 | raises(SyntaxError, lambda: parse_output_schema_from_comment(d))
59 |
60 |
61 | def test_is_class_method():
62 | def f1():
63 | pass
64 |
65 | class F(object):
66 | def f2(self):
67 | pass
68 |
69 | assert not is_class_method(f1)
70 | assert is_class_method(F.f2)
71 | assert not is_class_method(F().f2)
72 |
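The schema comment annotation parsed above is what powers schema hints on plain functions; a sketch on the default pandas-based engine:

    import pandas as pd

    from fugue import transform

    # schema: *,b:int
    def add_b(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(b=1)

    res = transform(pd.DataFrame({"a": [0]}), add_b)  # res has columns a and b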
--------------------------------------------------------------------------------
/fugue/extensions/outputter/outputter.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | from fugue.dataframe import DataFrames
4 | from fugue.extensions.context import ExtensionContext
5 |
6 |
7 | class Outputter(ExtensionContext, ABC):
8 | """The interface to process one or multiple incoming dataframes without returning
9 | anything. For example, printing or saving dataframes should be a type of Outputter.
10 | Outputter is a task-level extension; it runs on the driver and is execution engine aware.
11 |
12 | To implement this class, you should not define ``__init__``; directly implement
13 | the interface functions.
14 |
15 | .. note::
16 |
17 | Before implementing this class, consider whether you really need this
18 | interface: with Fugue's interfaceless feature, implementing Outputter
19 | is commonly unnecessary, and the interfaceless approach may also
20 | decouple your code from Fugue.
21 |
22 | .. seealso::
23 |
24 | Please read
25 | :doc:`Outputter Tutorial `
26 | """
27 |
28 | @abstractmethod
29 | def process(self, dfs: DataFrames) -> None: # pragma: no cover
30 | """Process the collection of dataframes on driver side
31 |
32 | .. note::
33 |
34 | * It runs on driver side
35 | * The dataframes are not necessarily local, for example a SparkDataFrame
36 | * It is engine aware: you can put platform-dependent code in it (for example
37 | native pyspark code), but doing so may make your code less portable. If you
38 | only use the functions of the general ExecutionEngine, it remains portable.
39 |
40 | :param dfs: dataframe collection to process
41 | """
42 | raise NotImplementedError
43 |
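The interfaceless counterpart is a plain function returning None, like display in tests/fugue/workflow/test_workflow_parallel.py; a minimal sketch:

    from fugue import DataFrame, FugueWorkflow

    def display(df: DataFrame) -> None:
        df.show()

    dag = FugueWorkflow()
    dag.df([[0]], "a:int").output(display)
    dag.run()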
--------------------------------------------------------------------------------
/fugue/exceptions.py:
--------------------------------------------------------------------------------
1 | class FugueError(Exception):
2 | """Fugue exceptions"""
3 |
4 |
5 | class FugueBug(FugueError):
6 | """Fugue internal bug"""
7 |
8 |
9 | class FugueInvalidOperation(FugueError):
10 | """Invalid operation on the Fugue framework"""
11 |
12 |
13 | class FuguePluginsRegistrationError(FugueError):
14 | """Fugue plugins registration error"""
15 |
16 |
17 | class FugueDataFrameError(FugueError):
18 | """Fugue dataframe related error"""
19 |
20 |
21 | class FugueDataFrameInitError(FugueDataFrameError):
22 | """Fugue dataframe initialization error"""
23 |
24 |
25 | class FugueDatasetEmptyError(FugueDataFrameError):
26 | """Fugue dataframe is empty"""
27 |
28 |
29 | class FugueDataFrameOperationError(FugueDataFrameError):
30 | """Fugue dataframe invalid operation"""
31 |
32 |
33 | class FugueWorkflowError(FugueError):
34 | """Fugue workflow exceptions"""
35 |
36 |
37 | class FugueWorkflowCompileError(FugueWorkflowError):
38 | """Fugue workflow compile time error"""
39 |
40 |
41 | class FugueWorkflowCompileValidationError(FugueWorkflowCompileError):
42 | """Fugue workflow compile time validation error"""
43 |
44 |
45 | class FugueInterfacelessError(FugueWorkflowCompileError):
46 | """Fugue interfaceless exceptions"""
47 |
48 |
49 | class FugueWorkflowRuntimeError(FugueWorkflowError):
50 | """Fugue workflow compile time error"""
51 |
52 |
53 | class FugueWorkflowRuntimeValidationError(FugueWorkflowRuntimeError):
54 | """Fugue workflow runtime validation error"""
55 |
56 |
57 | class FugueSQLError(FugueWorkflowCompileError):
58 | """Fugue SQL error"""
59 |
60 |
61 | class FugueSQLSyntaxError(FugueSQLError):
62 | """Fugue SQL syntax error"""
63 |
64 |
65 | class FugueSQLRuntimeError(FugueWorkflowRuntimeError):
66 | """Fugue SQL runtime error"""
67 |
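Since the classes form a hierarchy, catching a base class covers the derived ones; a small sketch:

    from fugue.exceptions import FugueDataFrameError, FugueDatasetEmptyError

    try:
        raise FugueDatasetEmptyError("no rows")
    except FugueDataFrameError:
        pass  # FugueDatasetEmptyError derives from FugueDataFrameError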
--------------------------------------------------------------------------------
/fugue/extensions/processor/processor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | from fugue.dataframe import DataFrame, DataFrames
4 | from fugue.extensions.context import ExtensionContext
5 |
6 |
7 | class Processor(ExtensionContext, ABC):
8 | """The interface to process one or multiple incoming dataframes and return one
9 | DataFrame. For example, dropping a column of a dataframe should be a type of Processor.
10 | Processor is a task-level extension; it runs on the driver and is execution engine aware.
11 |
12 | To implement this class, you should not define ``__init__``; directly implement
13 | the interface functions.
14 |
15 | .. note::
16 |
17 | Before implementing this class, consider whether you really need this
18 | interface: with Fugue's interfaceless feature, implementing Processor
19 | is commonly unnecessary, and the interfaceless approach may also
20 | decouple your code from Fugue.
21 |
22 | .. seealso::
23 |
24 | Please read
25 | :doc:`Processor Tutorial `
26 | """
27 |
28 | @abstractmethod
29 | def process(self, dfs: DataFrames) -> DataFrame: # pragma: no cover
30 | """Process the collection of dataframes on driver side
31 |
32 | .. note::
33 |
34 | * It runs on driver side
35 | * The dataframes are not necessarily local, for example a SparkDataFrame
36 | * It is engine aware: you can put platform-dependent code in it (for example
37 | native pyspark code), but doing so may make your code less portable. If you
38 | only use the functions of the general ExecutionEngine, it remains portable.
39 |
40 | :param dfs: dataframe collection to process
41 | :return: the result dataframe
42 | """
43 | raise NotImplementedError
44 |
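The interfaceless counterpart is a plain function from a dataframe to a dataframe, like process in tests/fugue/workflow/test_workflow_parallel.py; a minimal sketch:

    from fugue import DataFrame, FugueWorkflow

    def identity(df: DataFrame) -> DataFrame:
        return df

    dag = FugueWorkflow()
    dag.df([[0]], "a:int").process(identity).show()
    dag.run()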
--------------------------------------------------------------------------------
/tests/fugue_ibis/test_execution_engine.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import pytest
4 |
5 | import fugue.test as ft
6 | from fugue_test.builtin_suite import BuiltInTests
7 | from fugue_test.execution_suite import ExecutionEngineTests
8 |
9 | from .mock.tester import mockibisduck_session # noqa: F401 # pylint: disable-all
10 |
11 |
12 | @ft.fugue_test_suite("mockibisduck", mark_test=True)
13 | class IbisExecutionEngineTests(ExecutionEngineTests.Tests):
14 | def test_select(self):
15 | # it can't work properly with DuckDB (hugeint is not recognized)
16 | pass
17 |
18 |
19 | @ft.fugue_test_suite(("mockibisduck", {"fugue.force_is_ibis": True}), mark_test=True)
20 | class IbisExecutionEngineForceIbisTests(ExecutionEngineTests.Tests):
21 | def test_properties(self):
22 | assert not self.engine.is_distributed
23 | assert not self.engine.map_engine.is_distributed
24 | assert not self.engine.sql_engine.is_distributed
25 |
26 | assert (self.engine.sql_engine.get_temp_table_name()
27 | != self.engine.sql_engine.get_temp_table_name())
28 |
29 | def test_select(self):
30 | # it can't work properly with DuckDB (hugeint is not recognized)
31 | pass
32 |
33 | def test_get_parallelism(self):
34 | assert self.engine.get_current_parallelism() == 1
35 |
36 | def test_union(self):
37 | if sys.version_info >= (3, 9):
38 | # ibis 3.8 support no longer works
39 | return super().test_union()
40 |
41 |
42 | @ft.fugue_test_suite("mockibisduck", mark_test=True)
43 | class DuckBuiltInTests(BuiltInTests.Tests):
44 | def test_df_select(self):
45 | # it can't work properly with DuckDB (hugeint is not recognized)
46 | pass
47 |
48 |
49 | @ft.fugue_test_suite(("mockibisduck", {"fugue.force_is_ibis": True}), mark_test=True)
50 | class DuckBuiltInForceIbisTests(BuiltInTests.Tests):
51 | def test_df_select(self):
52 | # it can't work properly with DuckDB (hugeint is not recognized)
53 | pass
54 |
--------------------------------------------------------------------------------
/fugue_ray/registry.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import ray.data as rd
4 | from triad import run_at_def
5 |
6 | from fugue import DataFrame, register_execution_engine
7 | from fugue.dev import (
8 | DataFrameParam,
9 | ExecutionEngineParam,
10 | fugue_annotated_param,
11 | is_pandas_or,
12 | )
13 | from fugue.plugins import as_fugue_dataset, infer_execution_engine
14 |
15 | from .dataframe import RayDataFrame
16 | from .execution_engine import RayExecutionEngine
17 | from .tester import RayTestBackend # noqa: F401 # pylint: disable-all
18 |
19 |
20 | @infer_execution_engine.candidate(
21 | lambda objs: is_pandas_or(objs, (rd.Dataset, RayDataFrame))
22 | )
23 | def _infer_ray_client(objs: Any) -> Any:
24 | return "ray"
25 |
26 |
27 | @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, rd.Dataset))
28 | def _ray_as_fugue_df(df: rd.Dataset, **kwargs: Any) -> RayDataFrame:
29 | return RayDataFrame(df, **kwargs)
30 |
31 |
32 | def _register_engines() -> None:
33 | register_execution_engine(
34 | "ray", lambda conf, **kwargs: RayExecutionEngine(conf=conf), on_dup="ignore"
35 | )
36 |
37 |
38 | @fugue_annotated_param(RayExecutionEngine)
39 | class _RayExecutionEngineParam(ExecutionEngineParam):
40 | pass
41 |
42 |
43 | @fugue_annotated_param(rd.Dataset)
44 | class _RayDatasetParam(DataFrameParam):
45 | def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
46 | assert isinstance(ctx, RayExecutionEngine)
47 | return ctx._to_ray_df(df).native
48 |
49 | def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
50 | assert isinstance(output, rd.Dataset)
51 | assert isinstance(ctx, RayExecutionEngine)
52 | return RayDataFrame(output, schema=schema)
53 |
54 | def count(self, df: DataFrame) -> int: # pragma: no cover
55 | raise NotImplementedError("not allowed")
56 |
57 |
58 | @run_at_def
59 | def _register() -> None:
60 | """Register Ray Execution Engine"""
61 | _register_engines()
62 |
--------------------------------------------------------------------------------
/tests/fugue/workflow/test_runtime_exception.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from fugue import FugueWorkflow
3 | import sys
4 | import traceback
5 | from fugue.constants import (
6 | FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE,
7 | FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE,
8 | )
9 |
10 |
11 | def test_runtime_exception():
12 | if sys.version_info < (3, 7):
13 | return
14 |
15 | def tr(df: pd.DataFrame) -> pd.DataFrame:
16 | raise Exception
17 |
18 | def show(df):
19 | df.show()
20 |
21 | dag = FugueWorkflow()
22 | df = dag.df([[0]], "a:int")
23 | df = df.transform(tr, schema="*")
24 | show(df)
25 |
26 | try:
27 | dag.run()
28 | except Exception:
29 | assert len(traceback.extract_tb(sys.exc_info()[2])) < 10
30 |
31 | dag = FugueWorkflow({FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: False})
32 | df = dag.df([[0]], "a:int")
33 | df = df.transform(tr, schema="*")
34 | show(df)
35 |
36 | try:
37 | dag.run("native")
38 | except Exception:
39 | assert len(traceback.extract_tb(sys.exc_info()[2])) > 10
40 |
41 | dag = FugueWorkflow({FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: ""})
42 | df = dag.df([[0]], "a:int")
43 | df = df.transform(tr, schema="*")
44 | show(df)
45 |
46 | try:
47 | dag.run("native")
48 | except Exception:
49 | assert len(traceback.extract_tb(sys.exc_info()[2])) > 10
50 |
51 |
52 | def test_modified_exception():
53 | if sys.version_info < (3, 7):
54 | return
55 |
56 | def tr(df: pd.DataFrame) -> pd.DataFrame:
57 | raise Exception
58 |
59 | def show(df):
60 | df.show()
61 |
62 | def tt(df):
63 | __modified_exception__ = NotImplementedError()
64 | return df.transform(tr, schema="*")
65 |
66 | dag = FugueWorkflow()
67 | df = dag.df([[0]], "a:int")
68 | df = tt(df)
69 | show(df)
70 |
71 | try:
72 | dag.run()
73 | except Exception as ex:
74 | assert isinstance(ex.__cause__, NotImplementedError)
75 |
--------------------------------------------------------------------------------
/.github/workflows/test_dask.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Dask Tests
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | paths-ignore:
10 | - 'docs/**'
11 | - '**.md'
12 | pull_request:
13 | branches: [ master ]
14 | paths-ignore:
15 | - 'docs/**'
16 | - '**.md'
17 |
18 | concurrency:
19 | group: ${{ github.workflow }}-${{ github.ref }}
20 | cancel-in-progress: true
21 |
22 | jobs:
23 | test_dask_lower_bound:
24 | name: Dask 2024.4.0
25 | runs-on: ubuntu-latest
26 |
27 | steps:
28 | - uses: actions/checkout@v2
29 | - name: Set up Python 3.10
30 | uses: actions/setup-python@v1
31 | with:
32 | python-version: "3.10"
33 | - name: Install dependencies
34 | run: make devenv
35 | - name: Setup Dask
36 | run: pip install pyarrow==7.0.0 pandas==2.0.2 dask[dataframe,distributed]==2024.4.0
37 | - name: Test
38 | run: make testdask
39 |
40 | test_dask_sql_latest:
41 | name: Dask with SQL Latest
42 | runs-on: ubuntu-latest
43 |
44 | steps:
45 | - uses: actions/checkout@v2
46 | - name: Set up Python 3.10
47 | uses: actions/setup-python@v1
48 | with:
49 | python-version: "3.10"
50 | - name: Install dependencies
51 | run: make devenv
52 | - name: Test
53 | run: make testdask
54 |
55 | test_dask_latest:
56 | name: Dask without SQL Latest
57 | runs-on: ubuntu-latest
58 |
59 | steps:
60 | - uses: actions/checkout@v2
61 | - name: Set up Python 3.11
62 | uses: actions/setup-python@v1
63 | with:
64 | python-version: "3.11"
65 | - name: Install dependencies
66 | run: make devenv
67 | - name: Remove Dask SQL
68 | run: pip uninstall -y dask-sql qpd fugue-sql-antlr sqlglot
69 | - name: Test
70 | run: make testdask
71 |
--------------------------------------------------------------------------------
/fugue/extensions/_builtins/creators.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Callable, Optional
2 |
3 | from triad import Schema, assert_or_throw, to_uuid
4 |
5 | from fugue.collections.yielded import Yielded
6 | from fugue.dataframe import DataFrame
7 | from fugue.exceptions import FugueWorkflowCompileError
8 | from fugue.execution.api import as_fugue_engine_df
9 | from fugue.extensions.creator import Creator
10 |
11 |
12 | class Load(Creator):
13 | def create(self) -> DataFrame:
14 | kwargs = self.params.get("params", dict())
15 | path = self.params.get_or_throw("path", str)
16 | format_hint = self.params.get("fmt", "")
17 | columns = self.params.get_or_none("columns", object)
18 |
19 | return self.execution_engine.load_df(
20 | path=path, format_hint=format_hint, columns=columns, **kwargs
21 | )
22 |
23 |
24 | class CreateData(Creator):
25 | def __init__(
26 | self,
27 | df: Any,
28 | schema: Any = None,
29 | data_determiner: Optional[Callable[[Any], Any]] = None,
30 | ) -> None:
31 | if isinstance(df, Yielded):
32 | assert_or_throw(
33 | schema is None,
34 | FugueWorkflowCompileError("schema must be None when data is Yielded"),
35 | )
36 | super().__init__()
37 | self._df = df
38 | self._schema = schema if schema is None else Schema(schema)
39 | self._data_determiner = data_determiner
40 |
41 | def create(self) -> DataFrame:
42 | if isinstance(self._df, Yielded):
43 | return self.execution_engine.load_yielded(self._df)
44 | return as_fugue_engine_df(self.execution_engine, self._df, schema=self._schema)
45 |
46 | def _df_uid(self):
47 | if self._data_determiner is not None:
48 | return self._data_determiner(self._df)
49 | if isinstance(self._df, Yielded):
50 | return self._df
51 | return 1
52 |
53 | def __uuid__(self) -> str:
54 | return to_uuid(super().__uuid__(), self._df_uid(), self._schema)
55 |
--------------------------------------------------------------------------------
/.github/workflows/test_spark.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Spark Tests
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | paths-ignore:
10 | - 'docs/**'
11 | - '**.md'
12 | pull_request:
13 | branches: [ master ]
14 | paths-ignore:
15 | - 'docs/**'
16 | - '**.md'
17 |
18 | concurrency:
19 | group: ${{ github.workflow }}-${{ github.ref }}
20 | cancel-in-progress: true
21 |
22 | jobs:
23 | test_combinations:
24 | name: Spark ${{ matrix.spark-version }} Pandas ${{ matrix.pandas-version }}
25 | runs-on: ubuntu-latest
26 | strategy:
27 | matrix:
28 | spark-version: ["3.4.0","3.5.7"] # TODO: add 4.0.1
29 | pandas-version: ["1.5.3","2.0.1"]
30 |
31 | steps:
32 | - uses: actions/checkout@v2
33 | - name: Set up Python 3.10
34 | uses: actions/setup-python@v1
35 | with:
36 | python-version: "3.10"
37 | - name: Install dependencies
38 | run: make devenv
39 | - name: Install Spark ${{ matrix.spark-version }}
40 | run: pip install "pyspark==${{ matrix.spark-version }}"
41 | - name: Install Pandas ${{ matrix.pandas-version }}
42 | run: pip install "pandas==${{ matrix.pandas-version }}"
43 | - name: Downgrade Ibis
44 | if: matrix.spark-version < '3.4.0'
45 | run: pip install "ibis-framework<5"
46 | - name: Test
47 | run: make testspark
48 |
49 | test_connect:
50 | name: Spark Connect
51 | runs-on: ubuntu-latest
52 |
53 | steps:
54 | - uses: actions/checkout@v2
55 | - name: Set up Python 3.10
56 | uses: actions/setup-python@v1
57 | with:
58 | python-version: "3.10"
59 | - name: Install dependencies
60 | run: make devenv
61 | - name: Setup Spark
62 | run: |
63 | pip install "pyspark==3.5.7"
64 | make sparkconnect
65 | - name: Test
66 | run: make testsparkconnect
67 |
--------------------------------------------------------------------------------
/tests/fugue/extensions/creator/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, Iterable, List
2 |
3 | from fugue.dataframe import ArrayDataFrame
4 | from fugue.exceptions import FugueInterfacelessError
5 | from fugue.extensions.transformer import Transformer, _to_transformer, transformer
6 | from pytest import raises
7 | from triad.collections.schema import Schema
8 |
9 |
10 | def test_transformer():
11 | assert isinstance(t1, Transformer)
12 | df = ArrayDataFrame([[0]], "a:int")
13 | t1._output_schema = t1.get_output_schema(df)
14 | assert t1.output_schema == "a:int,b:int"
15 | t2._output_schema = t2.get_output_schema(df)
16 | assert t2.output_schema == "b:int,a:int"
17 | assert [[0, 1]] == list(t3(df.as_array_iterable()))
18 |
19 |
20 | def test__to_transformer():
21 | a = _to_transformer(t1, None)
22 | assert isinstance(a, Transformer)
23 | a._x = 1
24 | # every parse should produce a different transformer even the input is
25 | # a transformer instance
26 | b = _to_transformer(t1, None)
27 | assert isinstance(b, Transformer)
28 | assert "_x" not in b.__dict__
29 | c = _to_transformer("t1", None)
30 | assert isinstance(c, Transformer)
31 | assert "_x" not in c.__dict__
32 | c._x = 1
33 | d = _to_transformer("t1", None)
34 | assert isinstance(d, Transformer)
35 | assert "_x" not in d.__dict__
36 | raises(FugueInterfacelessError, lambda: _to_transformer(t4, None))
37 | raises(FugueInterfacelessError, lambda: _to_transformer("t4", None))
38 | e = _to_transformer("t4", "*,b:int")
39 | assert isinstance(e, Transformer)
40 |
41 |
42 | @transformer(["*", None, "b:int"])
43 | def t1(df: Iterable[Dict[str, Any]]) -> Iterable[Dict[str, Any]]:
44 | for r in df:
45 | r["b"] = 1
46 | yield r
47 |
48 |
49 | @transformer([Schema("b:int"), "*"])
50 | def t2(df: Iterable[Dict[str, Any]]) -> Iterable[Dict[str, Any]]:
51 | for r in df:
52 | r["b"] = 1
53 | yield r
54 |
55 |
56 | @transformer("*, b:int")
57 | def t3(df: Iterable[List[Any]]) -> Iterable[List[Any]]:
58 | for r in df:
59 | r += [1]
60 | yield r
61 |
62 |
63 | def t4(df: Iterable[List[Any]]) -> Iterable[List[Any]]:
64 | for r in df:
65 | r += [1]
66 | yield r
67 |
--------------------------------------------------------------------------------
/fugue/sql/_utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Any, Dict, Optional
3 |
4 | from triad import assert_or_throw
5 |
6 | from ..collections.yielded import Yielded
7 | from ..exceptions import FugueSQLError
8 | from ..workflow.workflow import FugueWorkflow, WorkflowDataFrame
9 |
10 | MATCH_QUOTED_STRING = r"([\"'])(({|%|})*)\1"
11 |
12 |
13 | def fill_sql_template(sql: str, params: Dict[str, Any]):
14 | """Prepare string to be executed, inserts params into sql template
15 | ---
16 | :param sql: jinja compatible template
17 | :param params: params to be inserted into template
18 | """
19 | import jinja2
20 | from jinja2 import Template
21 |
22 | try:
23 | if "self" in params:
24 | params = {k: v for k, v in params.items() if k != "self"}
25 | single_quote_pattern = "'{{% raw %}}{}{{% endraw %}}'"
26 | double_quote_pattern = '"{{% raw %}}{}{{% endraw %}}"'
27 | new_sql = re.sub(
28 | MATCH_QUOTED_STRING,
 29 |             lambda match: double_quote_pattern.format(match.group(2))
 30 |             if match.group(1) == '"'
 31 |             else single_quote_pattern.format(match.group(2)),
32 | sql,
33 | )
34 |
35 | template = Template(new_sql)
36 |
 37 |     except jinja2.exceptions.TemplateSyntaxError:
 38 |         # fall back to rendering the original sql as-is
 39 |         template = Template(sql)
40 |
41 | return template.render(**params)
42 |
43 |
44 | class LazyWorkflowDataFrame:
45 | def __init__(self, key: str, df: Any, workflow: FugueWorkflow):
46 | self._key = key
47 | self._df = df
48 | self._workflow = workflow
49 | self._wdf: Optional[WorkflowDataFrame] = None
50 |
51 | def get_df(self) -> WorkflowDataFrame:
52 | if self._wdf is None:
53 | self._wdf = self._get_df()
54 | return self._wdf
55 |
56 | def _get_df(self) -> WorkflowDataFrame:
57 | if isinstance(self._df, Yielded):
58 | return self._workflow.df(self._df)
59 | if isinstance(self._df, WorkflowDataFrame):
60 | assert_or_throw(
61 | self._df.workflow is self._workflow,
62 | lambda: FugueSQLError(
63 | f"{self._key}, {self._df} is from another workflow"
64 | ),
65 | )
66 | return self._df
67 | return self._workflow.df(self._df)
68 |
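 69 | # A minimal, hedged sketch of fill_sql_template's behavior, for illustration
 70 | # only (not part of the module's API):
 71 | def _demo_fill_sql_template() -> None:  # pragma: no cover
 72 |     # plain jinja substitution
 73 |     assert fill_sql_template("SELECT * FROM {{tbl}}", {"tbl": "t"}) == "SELECT * FROM t"
 74 |     # "self" is dropped from params before rendering
 75 |     assert fill_sql_template("{{x}}", {"self": object(), "x": 1}) == "1"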
--------------------------------------------------------------------------------
/fugue/collections/yielded.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from triad import assert_or_throw
4 | from triad.utils.hash import to_uuid
5 |
6 |
7 | class Yielded(object):
8 | """Yields from :class:`~fugue.workflow.workflow.FugueWorkflow`.
9 | Users shouldn't create this object directly.
10 |
11 | :param yid: unique id for determinism
12 | """
13 |
14 | def __init__(self, yid: str):
15 | self._yid = to_uuid(yid)
16 |
17 | def __uuid__(self) -> str:
18 | """uuid of the instance"""
19 | return self._yid
20 |
21 | @property
22 | def is_set(self) -> bool: # pragma: no cover
23 | """Whether the value is set. It can be false if the parent workflow
24 | has not been executed.
25 | """
26 | raise NotImplementedError
27 |
28 | def __copy__(self) -> Any: # pragma: no cover
29 | """``copy`` should have no effect"""
30 | return self
31 |
32 | def __deepcopy__(self, memo: Any) -> Any: # pragma: no cover
33 | """``deepcopy`` should have no effect"""
34 | return self
35 |
36 |
37 | class PhysicalYielded(Yielded):
38 | """Physical yielded object from :class:`~fugue.workflow.workflow.FugueWorkflow`.
39 | Users shouldn't create this object directly.
40 |
41 | :param yid: unique id for determinism
42 | :param storage_type: ``file`` or ``table``
43 | """
44 |
45 | def __init__(self, yid: str, storage_type: str):
46 | super().__init__(yid)
47 | self._name = ""
48 | assert_or_throw(
49 | storage_type in ["file", "table"],
50 | ValueError(f"{storage_type} not in (file, table) "),
51 | )
52 | self._storage_type = storage_type
53 |
54 | @property
55 | def is_set(self) -> bool:
56 | return self._name != ""
57 |
58 | def set_value(self, name: str) -> None:
59 | """Set the storage name after compute
60 |
61 | :param name: name reference of the storage
62 | """
63 | self._name = name
64 |
65 | @property
66 | def name(self) -> str:
67 | """The name reference of the yield"""
68 | assert_or_throw(self.is_set, "value is not set")
69 | return self._name
70 |
71 | @property
72 | def storage_type(self) -> str:
73 | """The storage type of this yield"""
74 | return self._storage_type
75 |
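 76 | # A minimal sketch of the yield lifecycle, for illustration only
 77 | # (the id and table name below are hypothetical):
 78 | def _demo_physical_yielded() -> None:  # pragma: no cover
 79 |     y = PhysicalYielded("my_yield_id", storage_type="table")
 80 |     assert not y.is_set
 81 |     y.set_value("my_schema.my_table")  # done by the workflow after compute
 82 |     assert y.name == "my_schema.my_table" and y.storage_type == "table"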
--------------------------------------------------------------------------------
/tests/fugue/dataframe/test_arrow_dataframe.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import pandas as pd
4 | import pyarrow as pa
5 | from pytest import raises
6 |
7 | import fugue.api as fa
8 | import fugue.test as ft
9 | from fugue.dataframe import ArrowDataFrame
10 | from fugue_test.dataframe_suite import DataFrameTests
11 |
12 |
13 | @ft.fugue_test_suite("native", mark_test=True)
14 | class ArrowDataFrameTests(DataFrameTests.Tests):
15 | def df(self, data: Any = None, schema: Any = None) -> ArrowDataFrame:
16 | return ArrowDataFrame(data, schema)
17 |
18 |
19 | @ft.fugue_test_suite("native", mark_test=True)
20 | class NativeArrowDataFrameTests(DataFrameTests.NativeTests):
 21 |     def df(self, data: Any = None, schema: Any = None) -> pa.Table:
22 | return ArrowDataFrame(data, schema).as_arrow()
23 |
24 | def to_native_df(self, pdf: pd.DataFrame) -> Any: # pragma: no cover
25 | return pa.Table.from_pandas(pdf)
26 |
27 | def test_num_partitions(self):
28 | assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1
29 |
30 |
31 | def test_init():
32 | df = ArrowDataFrame(schema="a:str,b:int")
33 | assert df.empty
34 | assert df.schema == "a:str,b:int"
35 | assert df.is_bounded
36 |
37 | df = ArrowDataFrame(pd.DataFrame([], columns=["a", "b"]), schema="a:str,b:int")
38 | assert df.empty
39 | assert df.schema == "a:str,b:int"
40 | assert df.is_bounded
41 |
42 | data = [["a", "1"], ["b", "2"]]
43 | df = ArrowDataFrame(data, "a:str,b:str")
44 | assert [["a", "1"], ["b", "2"]] == df.as_array(type_safe=True)
45 | data = [["a", 1], ["b", 2]]
46 | df = ArrowDataFrame(data, "a:str,b:int")
47 | assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)
48 | df = ArrowDataFrame(data, "a:str,b:double")
49 | assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)
50 |
51 | ddf = ArrowDataFrame(df.native)
52 | assert [["a", 1.0], ["b", 2.0]] == ddf.as_array(type_safe=True)
53 |
54 | df = ArrowDataFrame(df.as_pandas(), "a:str,b:double")
55 | assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)
56 | df = ArrowDataFrame(df.as_pandas()["b"])
57 | assert [[1.0], [2.0]] == df.as_array(type_safe=True)
58 |
59 | df = ArrowDataFrame([], "x:str,y:double")
60 | assert df.empty
61 | assert df.is_local
62 | assert df.is_bounded
63 |
64 | raises(Exception, lambda: ArrowDataFrame(123))
65 |
--------------------------------------------------------------------------------
/tests/fugue/dataframe/test_dataframes.py:
--------------------------------------------------------------------------------
1 | from fugue.dataframe import DataFrames
2 | from fugue.dataframe.array_dataframe import ArrayDataFrame
3 | from fugue.dataframe.pandas_dataframe import PandasDataFrame
4 | from pytest import raises
5 | from triad.exceptions import InvalidOperationError
6 |
7 |
8 | def test_dataframes():
9 | df1 = ArrayDataFrame([[0]], "a:int")
10 | df2 = ArrayDataFrame([[1]], "a:int")
11 | dfs = DataFrames(a=df1, b=df2)
12 | assert dfs[0] is df1
13 | assert dfs[1] is df2
14 |
15 | dfs = DataFrames([df1, df2], df1)
16 | assert not dfs.has_key
17 | assert dfs[0] is df1
18 | assert dfs[1] is df2
19 | assert dfs[2] is df1
20 |
21 | dfs2 = DataFrames(dfs, dfs, df2)
22 | assert not dfs2.has_key
23 | assert dfs2[0] is df1
24 | assert dfs2[1] is df2
25 | assert dfs2[2] is df1
26 | assert dfs2[3] is df1
27 | assert dfs2[4] is df2
28 | assert dfs2[5] is df1
29 | assert dfs2[6] is df2
30 |
31 | dfs = DataFrames([("a", df1), ("b", df2)])
32 | assert dfs.has_key
33 | assert dfs[0] is df1
34 | assert dfs[1] is df2
35 | assert dfs["a"] is df1
36 | assert dfs["b"] is df2
37 |
38 | with raises(ValueError):
39 | dfs["c"] = 1
40 |
41 | with raises(ValueError):
42 | dfs2 = DataFrames(1)
43 |
44 | with raises(ValueError):
45 | dfs2 = DataFrames(a=df1, b=2)
46 |
47 | with raises(InvalidOperationError):
48 | dfs2 = DataFrames(dict(a=df1), df2)
49 |
50 | with raises(InvalidOperationError):
51 | dfs2 = DataFrames(df2, dict(a=df1))
52 |
53 | with raises(InvalidOperationError):
54 | dfs2 = DataFrames(df1, a=df2)
55 |
56 | with raises(InvalidOperationError):
57 | dfs2 = DataFrames(DataFrames(df1, df2), x=df2)
58 |
59 | dfs2 = DataFrames(dfs)
60 | assert dfs2.has_key
61 | assert dfs2[0] is df1
62 | assert dfs2[1] is df2
63 |
64 | dfs1 = DataFrames(a=df1, b=df2)
65 | dfs2 = dfs1.convert(lambda x: PandasDataFrame(x.as_array(), x.schema))
66 | assert len(dfs1) == len(dfs2)
67 | assert dfs2.has_key
68 | assert isinstance(dfs2["a"], PandasDataFrame)
69 | assert isinstance(dfs2["b"], PandasDataFrame)
70 |
71 | dfs1 = DataFrames(df1, df2)
72 | dfs2 = dfs1.convert(lambda x: PandasDataFrame(x.as_array(), x.schema))
73 | assert len(dfs1) == len(dfs2)
74 | assert not dfs2.has_key
75 | assert isinstance(dfs2[0], PandasDataFrame)
76 | assert isinstance(dfs2[1], PandasDataFrame)
--------------------------------------------------------------------------------
/images/logo.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/_static/logo.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/_static/logo_doc.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/fugue/_utils/interfaceless.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from typing import Callable, Optional
3 |
4 | from triad.utils.assertion import assert_or_throw
5 |
6 | _COMMENT_SCHEMA_ANNOTATION = "schema"
7 |
8 |
9 | def parse_comment_annotation(func: Callable, annotation: str) -> Optional[str]:
10 | """Parse comment annotation above the function. It try to find
11 | comment lines starts with the annotation from bottom up, and will use the first
12 | occurrance as the result.
13 |
14 | :param func: the function
15 | :param annotation: the annotation string
16 | :return: schema hint string
17 |
18 | .. admonition:: Examples
19 |
20 | .. code-block:: python
21 |
22 | # schema: a:int,b:str
23 | #schema:a:int,b:int # more comment
24 | # some comment
25 | def dummy():
26 | pass
27 |
28 | assert "a:int,b:int" == parse_comment_annotation(dummy, "schema:")
29 | """
30 | for orig in reversed((inspect.getcomments(func) or "").splitlines()):
31 | start = orig.find(":")
32 | if start <= 0:
33 | continue
34 | actual = orig[:start].replace("#", "", 1).strip()
35 | if actual != annotation:
36 | continue
37 | end = orig.find("#", start)
38 | s = orig[start + 1 : (end if end > 0 else len(orig))].strip()
39 | return s
40 | return None
41 |
42 |
43 | def parse_output_schema_from_comment(func: Callable) -> Optional[str]:
44 | """Parse schema hint from the comments above the function. It try to find
45 | comment lines starts with `schema:` from bottom up, and will use the first
46 | occurrance as the hint.
47 |
48 | :param func: the function
49 | :return: schema hint string
50 |
51 | .. admonition:: Examples
52 |
53 | .. code-block:: python
54 |
55 | # schema: a:int,b:str
56 | #schema:a:int,b:int # more comment
57 | # some comment
58 | def dummy():
59 | pass
60 |
61 | assert "a:int,b:int" == parse_output_schema_from_comment(dummy)
62 | """
63 | res = parse_comment_annotation(func, _COMMENT_SCHEMA_ANNOTATION)
64 | if res is None:
65 | return None
66 | assert_or_throw(res != "", SyntaxError("incorrect schema annotation"))
67 | return res.strip()
68 |
69 |
70 | def is_class_method(func: Callable) -> bool:
71 | sig = inspect.signature(func)
72 | # TODO: this is not the best way
73 | return "self" in sig.parameters
74 |
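 75 | # A short sketch of the heuristic above and its known limitation:
 76 | def _demo_is_class_method() -> None:  # pragma: no cover
 77 |     def f(a):
 78 |         pass
 79 |
 80 |     def g(self, a):  # a plain function that merely names a param "self"
 81 |         pass
 82 |
 83 |     assert not is_class_method(f)
 84 |     assert is_class_method(g)  # false positive: g is not actually a method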
--------------------------------------------------------------------------------
/tests/fugue/rpc/test_base.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | from fugue.rpc import make_rpc_server, to_rpc_handler, RPCFunc, EmptyRPCHandler
4 | from pytest import raises
5 | from triad import ParamDict
6 |
7 |
8 | def test_default_server():
9 | def k(value: str) -> str:
10 | return value + "x"
11 |
12 | def kk(value: str) -> str:
13 | return value + "xx"
14 |
15 | conf = {"x": "y"}
16 |
17 | with make_rpc_server(conf).start() as server:
18 | assert "y" == server.conf["x"]
 19 |         with server.start(): # a nested start has no effect
20 | client = server.make_client(k)
21 | assert "dddx" == client("ddd")
22 | client = server.make_client(kk)
23 | assert "dddxx" == client("ddd")
 24 |         server.stop() # an extra stop at the end has no effect
25 |
26 | with raises(pickle.PicklingError):
27 | pickle.dumps(client)
28 |
29 | with raises(pickle.PicklingError):
30 | pickle.dumps(server)
31 |
32 |
33 | def test_server_handlers():
34 | func = lambda x: x + "aa"
35 |
36 | class _Dict(RPCFunc):
37 | def __init__(self, obj):
38 | super().__init__(obj)
39 | self.start_called = 0
40 | self.stop_called = 0
41 |
42 | def start_handler(self):
43 | self.start_called += 1
44 |
45 | def stop_handler(self):
46 | self.stop_called += 1
47 |
48 | server = make_rpc_server({})
49 | server.start()
50 | d1 = _Dict(func)
51 | c1 = server.make_client(d1)
52 | assert "xaa" == c1("x")
53 | assert 1 == d1.start_called
54 | assert 0 == d1.stop_called
55 | server.stop()
56 | assert 1 == d1.start_called
57 | assert 1 == d1.stop_called
58 |
59 | with server.start():
60 | d2 = _Dict(func)
61 | c1 = server.make_client(d2)
62 | server.start()
63 | assert "xaa" == c1("x")
64 | assert 1 == d2.start_called
65 | assert 0 == d2.stop_called
66 | assert 1 == d1.start_called
67 | assert 1 == d1.stop_called
68 | server.stop()
69 | assert 1 == d2.start_called
70 | assert 1 == d2.stop_called
71 | assert 1 == d1.start_called
72 | assert 1 == d1.stop_called
73 |
74 |
75 | def test_to_rpc_handler():
76 | assert isinstance(to_rpc_handler(None), EmptyRPCHandler)
77 | assert isinstance(to_rpc_handler(lambda x: x), RPCFunc)
78 | handler = to_rpc_handler(lambda x: x)
79 | assert handler is to_rpc_handler(handler)
80 | raises(ValueError, lambda: to_rpc_handler(1))
81 |
--------------------------------------------------------------------------------
/fugue_dask/registry.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import dask.dataframe as dd
4 | from dask.distributed import Client
5 |
6 | from fugue import DataFrame
7 | from fugue.dev import (
8 | DataFrameParam,
9 | ExecutionEngineParam,
10 | fugue_annotated_param,
11 | is_pandas_or,
12 | )
13 | from fugue.plugins import (
14 | as_fugue_dataset,
15 | infer_execution_engine,
16 | parse_execution_engine,
17 | )
18 | from fugue_dask._utils import DASK_UTILS
19 | from fugue_dask.dataframe import DaskDataFrame
20 | from fugue_dask.execution_engine import DaskExecutionEngine
21 |
22 | from .tester import DaskTestBackend # noqa: F401 # pylint: disable-all
23 |
24 |
25 | @infer_execution_engine.candidate(
26 | lambda objs: is_pandas_or(objs, (dd.DataFrame, DaskDataFrame))
27 | )
28 | def _infer_dask_client(objs: Any) -> Any:
29 | return DASK_UTILS.get_or_create_client()
30 |
31 |
32 | @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, dd.DataFrame))
33 | def _dask_as_fugue_df(df: dd.DataFrame, **kwargs: Any) -> DaskDataFrame:
34 | return DaskDataFrame(df, **kwargs)
35 |
36 |
37 | @parse_execution_engine.candidate(
38 | lambda engine, conf, **kwargs: isinstance(engine, Client),
 39 |     priority=4,  # TODO: this is to override the dask-sql fugue integration
40 | )
41 | def _parse_dask_client(engine: Client, conf: Any, **kwargs: Any) -> DaskExecutionEngine:
42 | return DaskExecutionEngine(dask_client=engine, conf=conf)
43 |
44 |
45 | @parse_execution_engine.candidate(
46 | lambda engine, conf, **kwargs: isinstance(engine, str) and engine == "dask",
 47 |     priority=4,  # TODO: this is to override the dask-sql fugue integration
48 | )
49 | def _parse_dask_str(engine: str, conf: Any, **kwargs: Any) -> DaskExecutionEngine:
50 | return DaskExecutionEngine(conf=conf)
51 |
52 |
53 | @fugue_annotated_param(DaskExecutionEngine)
54 | class _DaskExecutionEngineParam(ExecutionEngineParam):
55 | pass
56 |
57 |
58 | @fugue_annotated_param(dd.DataFrame)
59 | class _DaskDataFrameParam(DataFrameParam):
60 | def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
61 | assert isinstance(ctx, DaskExecutionEngine)
62 | return ctx.to_df(df).native
63 |
64 | def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
65 | assert isinstance(output, dd.DataFrame)
66 | assert isinstance(ctx, DaskExecutionEngine)
67 | return ctx.to_df(output, schema=schema)
68 |
69 | def count(self, df: DataFrame) -> int: # pragma: no cover
70 | raise NotImplementedError("not allowed")
71 |
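 72 | # A hedged sketch of what the registrations above enable (assumes a dask
 73 | # dataframe ``ddf`` and a transformer function ``f``; not executed here):
 74 | #
 75 | #   import fugue.api as fa
 76 | #   fa.as_fugue_df(ddf)                              # routed to _dask_as_fugue_df
 77 | #   fa.transform(ddf, f, schema="*")                 # engine inferred via _infer_dask_client
 78 | #   fa.transform(ddf, f, schema="*", engine="dask")  # engine parsed via _parse_dask_str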
--------------------------------------------------------------------------------
/tests/fugue/dataframe/test_dataframe.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | import pandas as pd
4 | from pytest import raises
5 | from triad.collections.schema import Schema
6 |
7 | from fugue.dataframe import ArrayDataFrame, DataFrame
8 | from fugue.api import as_fugue_df, get_native_as_df
9 | from fugue.bag.array_bag import ArrayBag
10 |
11 |
12 | def test_as_fugue_df():
13 | with raises(NotImplementedError):
14 | as_fugue_df(10)
15 | with raises(TypeError):
16 | as_fugue_df(ArrayBag([1, 2]))
17 | df = pd.DataFrame([[0]], columns=["a"])
18 | assert isinstance(as_fugue_df(df), DataFrame)
19 |
20 |
21 | def test_get_native_as_df():
22 | with raises(NotImplementedError):
23 | get_native_as_df(10)
24 | # other tests are in the suites
25 |
26 |
27 | def test_show():
28 | df = ArrayDataFrame(schema="a:str,b:str")
29 | df.show()
30 |
31 | assert repr(df) == df._repr_html_()
32 |
33 | s = " ".join(["x"] * 2)
34 | df = ArrayDataFrame([[s, 1], ["b", 2]], "a:str,b:str")
35 | df.show()
36 |
37 | s = " ".join(["x"] * 200)
38 | df = ArrayDataFrame([[s, 1], ["b", 2]], "a:str,b:str")
39 | df.show()
40 |
41 | s = " ".join(["x"] * 200)
42 | df = ArrayDataFrame([[s, 1], ["b", s]], "a:str,b:str")
43 | df.show()
44 |
45 | s = "".join(["x"] * 2000)
46 | df = ArrayDataFrame([[s, 1], ["b", None]], "a:str,b:str")
47 | df.show()
48 |
49 | s = " ".join(["x"] * 20)
50 | schema = [f"a{x}:str" for x in range(20)]
51 | data = [[f"aasdfjasdfka;sdf{x}:str" for x in range(20)]]
52 | df = ArrayDataFrame(data, schema)
53 | df.show()
54 |
55 | s = " ".join(["x"] * 200)
56 | df = ArrayDataFrame([[s, 1], ["b", "s"]], "a:str,b:str")
57 | df.show(n=1, with_count=True, title="abc")
58 |
59 |
60 | def test_lazy_schema():
61 | df = MockDF([["a", 1], ["b", 2]], "a:str,b:str")
62 | assert callable(df._schema)
63 | assert df.schema == "a:str,b:str"
64 |
65 |
66 | def test_get_info_str():
67 | df = ArrayDataFrame([["a", 1], ["b", 2]], "a:str,b:str")
68 | assert '{"schema": "a:str,b:str", "type": '
69 | '"tests.collections.dataframe.test_dataframe.MockDF", "metadata": {}}' == df.get_info_str()
70 |
71 |
72 | def test_copy():
73 | df = ArrayDataFrame([["a", 1], ["b", 2]], "a:str,b:str")
74 | assert copy.copy(df) is df
75 | assert copy.deepcopy(df) is df
76 |
77 |
78 | class MockDF(ArrayDataFrame):
79 | def __init__(self, df=None, schema=None):
80 | super().__init__(df=df, schema=schema)
81 | DataFrame.__init__(self, lambda: Schema(schema))
82 |
--------------------------------------------------------------------------------
/fugue/_utils/exception.py:
--------------------------------------------------------------------------------
1 | from types import FrameType, TracebackType
2 | from typing import Callable, List, Optional
3 |
4 | _MODIFIED_EXCEPTION_VAR_NAME = "__modified_exception__"
5 |
6 |
7 | def frames_to_traceback(
8 | frame: Optional[FrameType],
9 | limit: int,
10 | should_prune: Optional[Callable[[str], bool]] = None,
11 | ) -> Optional[TracebackType]:
12 | ctb: Optional[TracebackType] = None
13 | skipped = False
14 | while frame is not None and limit > 0:
15 | if _MODIFIED_EXCEPTION_VAR_NAME in frame.f_locals:
16 | return TracebackType(
17 | tb_next=None,
18 | tb_frame=frame,
19 | tb_lasti=frame.f_lasti,
20 | tb_lineno=frame.f_lineno,
21 | )
22 | if not skipped:
23 | if should_prune is not None and should_prune(frame.f_globals["__name__"]):
24 | frame = frame.f_back
25 | continue
26 | skipped = True
27 | if should_prune is None or not should_prune(frame.f_globals["__name__"]):
28 | ctb = TracebackType(
29 | tb_next=ctb,
30 | tb_frame=frame,
31 | tb_lasti=frame.f_lasti,
32 | tb_lineno=frame.f_lineno,
33 | )
34 | limit -= 1
35 | frame = frame.f_back
36 | continue
37 | break # pragma: no cover
38 |
39 | return ctb
40 |
41 |
42 | def modify_traceback(
43 | traceback: Optional[TracebackType],
44 | should_prune: Optional[Callable[[str], bool]] = None,
45 | add_traceback: Optional[TracebackType] = None,
46 | ) -> Optional[TracebackType]:
47 | ctb: Optional[TracebackType] = None
48 |
49 | # get stack
50 | stack: List[TracebackType] = []
51 |
52 | if add_traceback is not None:
53 | f: Optional[TracebackType] = add_traceback
54 | while f is not None:
55 | stack.append(f)
56 | f = f.tb_next
57 | f = traceback
58 | while f is not None:
59 | stack.append(f)
60 | f = f.tb_next
61 | stack.reverse()
62 |
63 | # prune and reconstruct
64 | for n, f in enumerate(stack):
65 | if (
66 | n == 0
67 | or should_prune is None
68 | or not should_prune(f.tb_frame.f_globals["__name__"])
69 | ):
70 | ctb = TracebackType(
71 | tb_next=ctb,
72 | tb_frame=f.tb_frame,
73 | tb_lasti=f.tb_lasti,
74 | tb_lineno=f.tb_lineno,
75 | )
76 |
77 | return ctb
78 |
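 79 | # A hedged usage sketch: capture the current stack as a traceback chain while
 80 | # hiding fugue-internal frames, then attach it to a new exception.
 81 | def _demo_frames_to_traceback() -> None:  # pragma: no cover
 82 |     import sys
 83 |
 84 |     tb = frames_to_traceback(
 85 |         sys._getframe(),
 86 |         limit=30,
 87 |         should_prune=lambda mod: mod.startswith("fugue."),
 88 |     )
 89 |     raise RuntimeError("example").with_traceback(tb)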
--------------------------------------------------------------------------------
/fugue_test/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, Tuple
2 |
3 | import pyarrow as pa
4 | import pytest
5 | from triad.utils.pyarrow import to_pa_datatype
6 |
7 | _FUGUE_TEST_CONF_NAME = "fugue_test_conf"
8 |
9 |
10 | def pytest_addoption(parser: Any): # pragma: no cover
11 | parser.addini(
12 | _FUGUE_TEST_CONF_NAME,
13 | help="Configs for fugue testing execution engines",
14 | type="linelist",
15 | )
16 |
17 |
18 | def pytest_configure(config: Any):
19 | from fugue.test.plugins import _set_global_conf
20 |
21 | options = config.getini(_FUGUE_TEST_CONF_NAME)
22 | conf: Dict[str, Any] = {}
23 | if options:
24 | for line in options:
25 | line = line.strip()
26 | if not line.startswith("#"):
27 | k, v = _parse_line(line)
28 | conf[k] = v
29 | _set_global_conf(conf)
30 |
31 |
32 | def pytest_report_header(config, start_path):
33 | from fugue.test.plugins import _get_all_ini_conf
34 |
35 | header_lines = []
36 | header_lines.append("Fugue tests will be initialized with options:")
37 | for k, v in _get_all_ini_conf().items():
38 | header_lines.append(f"\t{k} = {v}")
39 | return "\n".join(header_lines)
40 |
41 |
42 | def _parse_line(line: str) -> Tuple[str, Any]:
43 | try:
44 | kv = line.split("=", 1)
45 | if len(kv) == 1:
46 | raise ValueError()
47 | kt = kv[0].split(":", 1)
48 | if len(kt) == 1:
49 | tp = pa.string()
50 | else:
51 | tp = to_pa_datatype(kt[1].strip())
52 | key = kt[0].strip()
53 | if key == "":
54 | raise ValueError()
55 | value = pa.compute.cast([kv[1].strip()], tp).to_pylist()[0]
56 | return key, value
57 | except Exception:
58 | raise ValueError(
59 | f"Invalid config line: {line}, it must be in format: key[:type]=value"
60 | )
61 |
62 |
63 | @pytest.fixture(scope="class")
64 | def backend_context(request: Any):
65 | from fugue.test.plugins import _make_backend_context, _parse_backend
66 |
67 | c, _ = _parse_backend(request.param)
68 | session = request.getfixturevalue(c + "_session")
69 | with _make_backend_context(request.param, session) as ctx:
70 | yield ctx
71 |
72 |
73 | @pytest.fixture(scope="class")
74 | def _class_backend_context(request, backend_context):
75 | from fugue.test.plugins import FugueTestContext
76 |
77 | request.cls._test_context = FugueTestContext(
78 | engine=backend_context.engine,
79 | session=backend_context.session,
80 | name=backend_context.name,
81 | )
82 | yield
83 |
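 84 | # A minimal sketch of the "key[:type]=value" lines accepted by _parse_line
 85 | # (the keys below are hypothetical):
 86 | def _demo_parse_line() -> None:  # pragma: no cover
 87 |     assert _parse_line("fugue.dummy:int=1") == ("fugue.dummy", 1)
 88 |     assert _parse_line("name=hello") == ("name", "hello")  # type defaults to str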
--------------------------------------------------------------------------------
/docs/api_sql/fugue_sql.rst:
--------------------------------------------------------------------------------
1 | fugue\_sql
2 | ===========
3 |
4 | .. |SchemaLikeObject| replace:: :ref:`Schema like object `
5 | .. |ParamsLikeObject| replace:: :ref:`Parameters like object `
6 | .. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
7 | .. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
8 | .. |PartitionLikeObject| replace:: :ref:`Partition like object `
9 | .. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object `
10 |
11 | .. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
12 | .. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
13 | .. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`
14 |
15 | .. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
16 | .. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
17 | .. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
18 | .. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
19 | .. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
20 | .. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
21 | .. |ZipComap| replace:: :ref:`Zip & Comap `
22 | .. |LoadSave| replace:: :ref:`Load & Save `
23 | .. |AutoPersist| replace:: :ref:`Auto Persist `
24 | .. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
25 | .. |CoTransformer| replace:: :ref:`CoTransformer `
26 | .. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
27 | .. |FugueDataTypes| replace:: :doc:`Fugue Data Types `
28 |
29 |
30 | fugue\_sql.exceptions
31 | ---------------------
32 |
33 | .. automodule:: fugue_sql.exceptions
34 | :members:
35 | :undoc-members:
36 | :show-inheritance:
37 |
38 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | pythonenv*
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 | .virtual_documents
124 |
125 | # mypy
126 | .mypy_cache
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 |
133 | .vscode
134 | tmp
135 |
136 | # Antlr
137 | .antlr
138 |
139 | # dask
140 | dask-worker-space
141 |
142 | # spark
143 | spark-warehouse
144 | =*
145 |
146 | # DS_Store
147 | *.DS_Store
148 |
--------------------------------------------------------------------------------
/fugue_spark/tester.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from typing import Any, Dict, Iterator
3 |
4 | from pyspark.sql import SparkSession
5 |
6 | import fugue.test as ft
7 |
8 | from ._utils.misc import SparkConnectSession
9 |
10 |
11 | @ft.fugue_test_backend
12 | class SparkTestBackend(ft.FugueTestBackend):
13 | name = "spark"
14 | default_session_conf = {
15 | "spark.app.name": "fugue-test-spark",
16 | "spark.master": "local[*]",
17 | "spark.default.parallelism": 4,
18 | "spark.dynamicAllocation.enabled": "false",
19 | "spark.executor.cores": 4,
20 | "spark.executor.instances": 1,
21 | "spark.io.compression.codec": "lz4",
22 | "spark.rdd.compress": "false",
23 | "spark.sql.shuffle.partitions": 4,
24 | "spark.shuffle.compress": "false",
25 | "spark.sql.catalogImplementation": "in-memory",
26 | "spark.sql.execution.arrow.pyspark.enabled": True,
27 | "spark.sql.adaptive.enabled": False,
28 | }
29 |
30 | @classmethod
31 | def transform_session_conf(cls, conf: Dict[str, Any]) -> Dict[str, Any]:
32 | return ft.extract_conf(conf, "spark.", remove_prefix=False)
33 |
34 | @classmethod
35 | @contextmanager
36 | def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
37 | with _create_session(session_conf).getOrCreate() as spark:
38 | yield spark
39 |
40 |
41 | if SparkConnectSession is not None:
42 |
43 | @ft.fugue_test_backend
44 | class SparkConnectTestBackend(SparkTestBackend):
45 | name = "sparkconnect"
46 | default_session_conf = {
47 | "spark.default.parallelism": 4,
48 | "spark.sql.shuffle.partitions": 4,
49 | "spark.sql.execution.arrow.pyspark.enabled": True,
50 | "spark.sql.adaptive.enabled": False,
51 | }
52 |
53 | @classmethod
54 | def transform_session_conf(
55 | cls, conf: Dict[str, Any]
56 | ) -> Dict[str, Any]: # pragma: no cover
57 | # replace sparkconnect. with spark.
58 | return {
59 | "spark." + k: v
60 | for k, v in ft.extract_conf(
61 | conf, cls.name + ".", remove_prefix=True
62 | ).items()
63 | }
64 |
65 | @classmethod
66 | @contextmanager
67 | def session_context(
68 | cls, session_conf: Dict[str, Any]
69 | ) -> Iterator[Any]: # pragma: no cover
70 | spark = _create_session(session_conf).remote("sc://localhost").getOrCreate()
71 | yield spark
72 |
73 |
74 | def _create_session(conf: Dict[str, Any]) -> Any:
75 | sb = SparkSession.builder
76 | for k, v in conf.items():
77 | sb = sb.config(k, v)
78 | return sb
79 |
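 80 | # A hedged sketch of pytest ini lines the backends above consume
 81 | # (hypothetical values):
 82 | #
 83 | #   fugue_test_conf =
 84 | #       spark.sql.shuffle.partitions:int=2        # SparkTestBackend keeps the prefix
 85 | #       sparkconnect.sql.shuffle.partitions:int=2 # mapped to spark.* by SparkConnectTestBackend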
--------------------------------------------------------------------------------
/docs/api/fugue.rpc.rst:
--------------------------------------------------------------------------------
1 | fugue.rpc
2 | ==========
3 |
4 | .. |SchemaLikeObject| replace:: :ref:`Schema like object `
5 | .. |ParamsLikeObject| replace:: :ref:`Parameters like object `
6 | .. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
7 | .. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
8 | .. |PartitionLikeObject| replace:: :ref:`Partition like object `
9 | .. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object `
10 |
11 | .. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
12 | .. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
13 | .. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`
14 |
15 | .. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
16 | .. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
17 | .. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
18 | .. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
19 | .. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
20 | .. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
21 | .. |ZipComap| replace:: :ref:`Zip & Comap `
22 | .. |LoadSave| replace:: :ref:`Load & Save `
23 | .. |AutoPersist| replace:: :ref:`Auto Persist `
24 | .. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
25 | .. |CoTransformer| replace:: :ref:`CoTransformer `
26 | .. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
27 | .. |FugueDataTypes| replace:: :doc:`Fugue Data Types `
28 |
29 |
30 | fugue.rpc.base
31 | --------------
32 |
33 | .. automodule:: fugue.rpc.base
34 | :members:
35 | :undoc-members:
36 | :show-inheritance:
37 |
38 | fugue.rpc.flask
39 | ---------------
40 |
41 | .. automodule:: fugue.rpc.flask
42 | :members:
43 | :undoc-members:
44 | :show-inheritance:
45 |
46 |
--------------------------------------------------------------------------------
/docs/api/fugue.sql.rst:
--------------------------------------------------------------------------------
1 | fugue.sql
2 | ==========
3 |
4 | .. |SchemaLikeObject| replace:: :ref:`Schema like object `
5 | .. |ParamsLikeObject| replace:: :ref:`Parameters like object `
6 | .. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
7 | .. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
8 | .. |PartitionLikeObject| replace:: :ref:`Partition like object `
9 | .. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object `
10 |
11 | .. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
12 | .. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
13 | .. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`
14 |
15 | .. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
16 | .. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
17 | .. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
18 | .. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
19 | .. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
20 | .. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
21 | .. |ZipComap| replace:: :ref:`Zip & Comap `
22 | .. |LoadSave| replace:: :ref:`Load & Save `
23 | .. |AutoPersist| replace:: :ref:`Auto Persist `
24 | .. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
25 | .. |CoTransformer| replace:: :ref:`CoTransformer `
26 | .. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
27 | .. |FugueDataTypes| replace:: :doc:`Fugue Data Types `
28 |
29 |
30 | fugue.sql.api
31 | -------------
32 |
33 | .. automodule:: fugue.sql.api
34 | :members:
35 | :undoc-members:
36 | :show-inheritance:
37 |
38 | fugue.sql.workflow
39 | ------------------
40 |
41 | .. automodule:: fugue.sql.workflow
42 | :members:
43 | :undoc-members:
44 | :show-inheritance:
45 |
46 |
--------------------------------------------------------------------------------
/docs/api/fugue.bag.rst:
--------------------------------------------------------------------------------
1 | fugue.bag
2 | ==========
3 |
4 | .. |SchemaLikeObject| replace:: :ref:`Schema like object `
5 | .. |ParamsLikeObject| replace:: :ref:`Parameters like object `
6 | .. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
7 | .. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
8 | .. |PartitionLikeObject| replace:: :ref:`Partition like object `
9 | .. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object `
10 |
11 | .. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
12 | .. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
13 | .. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`
14 |
15 | .. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
16 | .. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
17 | .. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
18 | .. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
19 | .. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
20 | .. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
21 | .. |ZipComap| replace:: :ref:`Zip & Comap `
22 | .. |LoadSave| replace:: :ref:`Load & Save `
23 | .. |AutoPersist| replace:: :ref:`Auto Persist `
24 | .. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
25 | .. |CoTransformer| replace:: :ref:`CoTransformer `
26 | .. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
27 | .. |FugueDataTypes| replace:: :doc:`Fugue Data Types `
28 |
29 |
30 | fugue.bag.array\_bag
31 | --------------------
32 |
33 | .. automodule:: fugue.bag.array_bag
34 | :members:
35 | :undoc-members:
36 | :show-inheritance:
37 |
38 | fugue.bag.bag
39 | -------------
40 |
41 | .. automodule:: fugue.bag.bag
42 | :members:
43 | :undoc-members:
44 | :show-inheritance:
45 |
46 |
--------------------------------------------------------------------------------
/fugue_test/bag_suite.py:
--------------------------------------------------------------------------------
1 | # pylint: disable-all
2 | # flake8: noqa
3 |
4 | from datetime import date, datetime
5 | from typing import Any
6 | from unittest import TestCase
7 | import copy
8 | import numpy as np
9 | import pandas as pd
10 | from fugue.bag import Bag, LocalBag
11 | from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
12 | from pytest import raises
13 | from triad.collections.schema import Schema
14 |
15 |
16 | class BagTests(object):
17 | """DataFrame level general test suite.
18 | All new DataFrame types should pass this test suite.
19 | """
20 |
21 | class Tests(TestCase):
22 | @classmethod
23 | def setUpClass(cls):
24 | pass
25 |
26 | @classmethod
27 | def tearDownClass(cls):
28 | pass
29 |
30 | def bg(self, data: Any = None) -> Bag: # pragma: no cover
31 | raise NotImplementedError
32 |
33 | def test_init_basic(self):
34 | raises(Exception, lambda: self.bg())
35 | bg = self.bg([])
36 | assert bg.empty
37 | assert copy.copy(bg) is bg
38 | assert copy.deepcopy(bg) is bg
39 |
40 | def test_peek(self):
41 | bg = self.bg([])
42 | raises(FugueDatasetEmptyError, lambda: bg.peek())
43 |
44 | bg = self.bg(["x"])
45 | assert not bg.is_bounded or 1 == bg.count()
46 | assert not bg.empty
47 | assert "x" == bg.peek()
48 |
49 | def test_as_array(self):
50 | bg = self.bg([2, 1, "a"])
51 | assert set([1, 2, "a"]) == set(bg.as_array())
52 |
53 | def test_as_array_special_values(self):
54 | bg = self.bg([2, None, "a"])
55 | assert set([None, 2, "a"]) == set(bg.as_array())
56 |
57 | bg = self.bg([np.float16(0.1)])
58 | assert set([np.float16(0.1)]) == set(bg.as_array())
59 |
60 | def test_head(self):
61 | bg = self.bg([])
62 | assert [] == bg.head(0).as_array()
63 | assert [] == bg.head(1).as_array()
64 | bg = self.bg([["a", 1]])
65 | if bg.is_bounded:
66 | assert [["a", 1]] == bg.head(1).as_array()
67 | assert [] == bg.head(0).as_array()
68 |
69 | bg = self.bg([1, 2, 3, 4])
70 | assert 2 == bg.head(2).count()
71 | bg = self.bg([1, 2, 3, 4])
72 | assert 4 == bg.head(10).count()
73 | h = bg.head(10)
74 | assert h.is_local and h.is_bounded
75 |
76 | def test_show(self):
77 | bg = self.bg(["a", 1])
78 | bg.show()
79 | bg.show(n=0)
80 | bg.show(n=1)
81 | bg.show(n=2)
82 | bg.show(title="title")
83 | bg.metadata["m"] = 1
84 | bg.show()
85 |
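 86 | # A hedged sketch of how a backend reuses this suite (ArrayBag is used as an
 87 | # example implementation, mirroring tests/fugue/bag/test_array_bag.py):
 88 | #
 89 | #   from fugue.bag.array_bag import ArrayBag
 90 | #
 91 | #   class ArrayBagTests(BagTests.Tests):
 92 | #       def bg(self, data=None):
 93 | #           return ArrayBag(data)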
--------------------------------------------------------------------------------
/fugue/workflow/_workflow_context.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 | from uuid import uuid4
3 |
4 | from adagio.instances import (
5 | NoOpCache,
6 | ParallelExecutionEngine,
7 | WorkflowContext,
8 | WorkflowHooks,
9 | )
10 | from adagio.specs import WorkflowSpec
11 | from fugue.constants import FUGUE_CONF_WORKFLOW_CONCURRENCY
12 | from fugue.dataframe import DataFrame
13 | from fugue.execution.execution_engine import ExecutionEngine
14 | from fugue.rpc.base import make_rpc_server, RPCServer
15 | from fugue.workflow._checkpoint import CheckpointPath
16 | from triad import SerializableRLock, ParamDict
17 |
18 |
19 | class FugueWorkflowContext(WorkflowContext):
20 | def __init__(
21 | self,
22 | engine: ExecutionEngine,
23 | compile_conf: Any = None,
24 | cache: Any = NoOpCache,
25 | workflow_engine: Any = None,
26 | hooks: Any = WorkflowHooks,
27 | ):
28 | conf = ParamDict(compile_conf)
29 | self._fugue_engine = engine
30 | self._lock = SerializableRLock()
31 | self._results: Dict[Any, DataFrame] = {}
32 | self._execution_id = ""
33 | self._checkpoint_path = CheckpointPath(self.execution_engine)
34 | self._rpc_server = make_rpc_server(engine.conf)
35 | if workflow_engine is None:
36 | workflow_engine = ParallelExecutionEngine(
37 | conf.get_or_throw(FUGUE_CONF_WORKFLOW_CONCURRENCY, int),
38 | self,
39 | )
40 | super().__init__(
41 | cache=cache,
42 | engine=workflow_engine,
43 | hooks=hooks,
44 | logger=self.execution_engine.log,
45 | config=conf,
46 | )
47 |
48 | def run(self, spec: WorkflowSpec, conf: Dict[str, Any]) -> None:
49 | try:
50 | self._execution_id = str(uuid4())
51 | self._checkpoint_path = CheckpointPath(self.execution_engine)
52 | self._checkpoint_path.init_temp_path(self._execution_id)
53 | self._rpc_server.start()
54 | super().run(spec, conf)
55 | finally:
56 | self._checkpoint_path.remove_temp_path()
57 | self._rpc_server.stop()
58 | self._execution_id = ""
59 |
60 | @property
61 | def checkpoint_path(self) -> CheckpointPath:
62 | return self._checkpoint_path
63 |
64 | @property
65 | def execution_engine(self) -> ExecutionEngine:
66 | return self._fugue_engine
67 |
68 | @property
69 | def rpc_server(self) -> RPCServer:
70 | return self._rpc_server
71 |
72 | def set_result(self, key: Any, df: DataFrame) -> None:
73 | with self._lock:
74 | self._results[key] = df
75 |
76 | def get_result(self, key: Any) -> DataFrame:
77 | with self._lock:
78 | return self._results[key]
79 |
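 80 | # A minimal construction sketch (``my_engine`` is an assumed ExecutionEngine);
 81 | # FUGUE_CONF_WORKFLOW_CONCURRENCY must be present in the compile conf because
 82 | # it is read with get_or_throw above.
 83 | #
 84 | #   ctx = FugueWorkflowContext(
 85 | #       engine=my_engine,
 86 | #       compile_conf={FUGUE_CONF_WORKFLOW_CONCURRENCY: 2},
 87 | #   )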
--------------------------------------------------------------------------------
/docs/api/fugue.dataset.rst:
--------------------------------------------------------------------------------
1 | fugue.dataset
2 | ==============
3 |
4 | .. |SchemaLikeObject| replace:: :ref:`Schema like object `
5 | .. |ParamsLikeObject| replace:: :ref:`Parameters like object `
6 | .. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
7 | .. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
8 | .. |PartitionLikeObject| replace:: :ref:`Partition like object `
9 | .. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object `
10 |
11 | .. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
12 | .. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
13 | .. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`
14 |
15 | .. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
16 | .. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
17 | .. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
18 | .. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
19 | .. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
20 | .. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
21 | .. |ZipComap| replace:: :ref:`Zip & Comap `
22 | .. |LoadSave| replace:: :ref:`Load & Save `
23 | .. |AutoPersist| replace:: :ref:`Auto Persist `
24 | .. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
25 | .. |CoTransformer| replace:: :ref:`CoTransformer `
26 | .. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
27 | .. |FugueDataTypes| replace:: :doc:`Fugue Data Types `
28 |
29 |
30 | fugue.dataset.api
31 | -----------------
32 |
33 | .. automodule:: fugue.dataset.api
34 | :members:
35 | :undoc-members:
36 | :show-inheritance:
37 |
38 | fugue.dataset.dataset
39 | ---------------------
40 |
41 | .. automodule:: fugue.dataset.dataset
42 | :members:
43 | :undoc-members:
44 | :show-inheritance:
45 |
46 |
--------------------------------------------------------------------------------
/docs/api/fugue.extensions.rst:
--------------------------------------------------------------------------------
1 | fugue.extensions
2 | =================
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | fugue.extensions.creator
8 | fugue.extensions.outputter
9 | fugue.extensions.processor
10 | fugue.extensions.transformer
11 |
12 | .. |SchemaLikeObject| replace:: :ref:`Schema like object `
13 | .. |ParamsLikeObject| replace:: :ref:`Parameters like object `
14 | .. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
15 | .. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
16 | .. |PartitionLikeObject| replace:: :ref:`Partition like object `
17 | .. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object `
18 |
19 | .. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
20 | .. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
21 | .. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`
22 |
23 | .. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
24 | .. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
25 | .. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
26 | .. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
27 | .. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
28 | .. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
29 | .. |ZipComap| replace:: :ref:`Zip & Comap `
30 | .. |LoadSave| replace:: :ref:`Load & Save `
31 | .. |AutoPersist| replace:: :ref:`Auto Persist `
32 | .. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
33 | .. |CoTransformer| replace:: :ref:`CoTransformer `
34 | .. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
35 | .. |FugueDataTypes| replace:: :doc:`Fugue Data Types `
36 |
37 |
38 | fugue.extensions.context
39 | ------------------------
40 |
41 | .. automodule:: fugue.extensions.context
42 | :members:
43 | :undoc-members:
44 | :show-inheritance:
45 |
46 |
--------------------------------------------------------------------------------
/tests/fugue/execution/test_execution_engine.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 |
3 | from pytest import raises
4 | from triad.collections.dict import ParamDict
5 | from triad.utils.convert import get_full_type_path
6 |
7 | from fugue import ExecutionEngine, NativeExecutionEngine, register_global_conf
8 | from fugue.constants import FUGUE_CONF_SQL_IGNORE_CASE
9 | from fugue.rpc.base import NativeRPCServer
10 | from fugue_duckdb import DuckDBEngine
11 |
12 |
13 | class _MockSQLEngine(DuckDBEngine):
14 | @property
15 | def execution_engine_constraint(self) -> Type[ExecutionEngine]:
16 | return _MockExecutionEngine
17 |
18 |
19 | class _MockExecutionEngine(NativeExecutionEngine):
20 | def __init__(self, conf=None):
21 | super().__init__(conf=conf)
22 | self._stop = 0
23 |
24 | def stop_engine(self):
25 | self._stop += 1
26 |
27 | def create_default_sql_engine(self):
28 | return _MockSQLEngine(self)
29 |
30 |
31 | class _MockRPC(NativeRPCServer):
32 | _start = 0
33 | _stop = 0
34 |
35 | def __init__(self, conf):
36 | super().__init__(conf)
37 | _MockRPC._start = 0
38 | _MockRPC._stop = 0
39 |
40 | def start_handler(self):
41 | _MockRPC._start += 1
42 |
43 | def stop_handler(self):
44 | _MockRPC._stop += 1
45 |
46 |
47 | def test_sql_engine_init():
48 | engine = _MockExecutionEngine()
49 | assert isinstance(engine.sql_engine, _MockSQLEngine)
50 |
51 | with raises(TypeError):
52 | _MockSQLEngine(NativeExecutionEngine())
53 |
54 |
55 | def test_start_stop():
56 | conf = {"fugue.rpc.server": get_full_type_path(_MockRPC)}
57 | engine = _MockExecutionEngine(conf=conf)
58 | engine.stop()
59 | assert 1 == engine._stop
60 | engine.stop() # stop will be called only once
61 | assert 1 == engine._stop
62 |
63 |
64 | def test_global_conf():
65 | register_global_conf({"ftest.a": 1})
66 | engine = _MockExecutionEngine()
67 | assert 1 == engine.conf.get_or_throw("ftest.a", int)
68 | engine = _MockExecutionEngine({"ftest.a": 2})
69 | assert 2 == engine.conf.get_or_throw("ftest.a", int)
70 | assert not engine.conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)
71 |
 72 |     # a duplicated key is allowed if its value matches the existing one
73 | register_global_conf({"ftest.a": 1, "ftest.b": 2}, on_dup=ParamDict.THROW)
74 | engine = _MockExecutionEngine()
75 | assert 1 == engine.conf.get_or_throw("ftest.a", int)
76 | assert 2 == engine.conf.get_or_throw("ftest.b", int)
77 |
78 | # transactional, of one value has problem, the whole conf will not be added
79 | with raises(ValueError):
80 | register_global_conf({"ftest.a": 2, "ftest.c": 3}, on_dup=ParamDict.THROW)
81 | assert 1 == engine.conf.get_or_throw("ftest.a", int)
82 | assert 2 == engine.conf.get_or_throw("ftest.b", int)
83 | assert "ftest.c" not in engine.conf
84 |
--------------------------------------------------------------------------------
/fugue_dask/_dask_sql_wrapper.py:
--------------------------------------------------------------------------------
from typing import Any, Optional

import dask.dataframe as dd

try:
    from dask.dataframe.dask_expr.io.parquet import ReadParquet

    HAS_DASK_EXPR = True  # newer dask
except ImportError:  # pragma: no cover
    HAS_DASK_EXPR = False  # older dask

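# On older dask (no dask-expr), dask-sql's Context can be used directly;
# on newer dask, the create_table override below adapts dask-expr collections.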
if not HAS_DASK_EXPR:  # pragma: no cover
    try:
        from dask_sql import Context as ContextWrapper  # pylint: disable-all
    except ImportError:  # pragma: no cover
        raise ImportError(
            "dask-sql is not installed. Please install it with `pip install dask-sql`"
        )
else:
    from triad.utils.assertion import assert_or_throw

    try:
        from dask_sql import Context
        from dask_sql.datacontainer import Statistics
        from dask_sql.input_utils import InputUtil
    except ImportError:  # pragma: no cover
        raise ImportError(
            "dask-sql is not installed. Please install it with `pip install dask-sql`"
        )

    class ContextWrapper(Context):  # type: ignore
        def create_table(
            self,
            table_name: str,
            input_table: dd.DataFrame,
            format: Optional[str] = None,  # noqa
            persist: bool = False,
            schema_name: Optional[str] = None,
            statistics: Optional[Statistics] = None,
            gpu: bool = False,
            **kwargs: Any,
        ) -> None:  # pragma: no cover
            assert_or_throw(
                isinstance(input_table, dd.DataFrame),
                lambda: ValueError(
                    f"input_table must be a dask dataframe, but got {type(input_table)}"
                ),
            )
            assert_or_throw(
                dd._dask_expr_enabled(), lambda: ValueError("Dask expr must be enabled")
            )
            schema_name = schema_name or self.schema_name

            dc = InputUtil.to_dc(
                input_table,
                table_name=table_name,
                format=format,
                persist=persist,
                gpu=gpu,
                **kwargs,
            )

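            # dask-expr keeps the read_parquet source in the expression tree;
            # extract it so dask-sql can associate the table with its file path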
            dask_filepath = None
            operations = input_table.find_operations(ReadParquet)
            for op in operations:
                dask_filepath = op._args[0]

            dc.filepath = dask_filepath
            self.schema[schema_name].filepaths[table_name.lower()] = dask_filepath

            if not statistics:
                statistics = Statistics(float("nan"))
            dc.statistics = statistics

            self.schema[schema_name].tables[table_name.lower()] = dc
            self.schema[schema_name].statistics[table_name.lower()] = statistics

--------------------------------------------------------------------------------
/docs/api/fugue.extensions.creator.rst:
--------------------------------------------------------------------------------
fugue.extensions.creator
=========================

.. |SchemaLikeObject| replace:: :ref:`Schema like object `
.. |ParamsLikeObject| replace:: :ref:`Parameters like object `
.. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
.. |PartitionLikeObject| replace:: :ref:`Partition like object `
.. |RPCHandlerLikeObject| replace:: :ref:`RPCHandler like object `

.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`

.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
.. |ZipComap| replace:: :ref:`Zip & Comap `
.. |LoadSave| replace:: :ref:`Load & Save `
.. |AutoPersist| replace:: :ref:`Auto Persist `
.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
.. |CoTransformer| replace:: :ref:`CoTransformer `
.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
.. |FugueDataTypes| replace:: :doc:`Fugue Data Types `


fugue.extensions.creator.convert
--------------------------------

.. automodule:: fugue.extensions.creator.convert
   :members:
   :undoc-members:
   :show-inheritance:

fugue.extensions.creator.creator
--------------------------------

.. automodule:: fugue.extensions.creator.creator
   :members:
   :undoc-members:
   :show-inheritance:

--------------------------------------------------------------------------------
/docs/api_ibis/fugue_ibis.execution.rst:
--------------------------------------------------------------------------------
fugue\_ibis.execution
======================

.. |SchemaLikeObject| replace:: :ref:`Schema like object `
.. |ParamsLikeObject| replace:: :ref:`Parameters like object `
.. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
.. |PartitionLikeObject| replace:: :ref:`Partition like object `
.. |RPCHandlerLikeObject| replace:: :ref:`RPCHandler like object `

.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`

.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
.. |ZipComap| replace:: :ref:`Zip & Comap `
.. |LoadSave| replace:: :ref:`Load & Save `
.. |AutoPersist| replace:: :ref:`Auto Persist `
.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
.. |CoTransformer| replace:: :ref:`CoTransformer `
.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
.. |FugueDataTypes| replace:: :doc:`Fugue Data Types `


fugue\_ibis.execution.ibis\_engine
----------------------------------

.. automodule:: fugue_ibis.execution.ibis_engine
   :members:
   :undoc-members:
   :show-inheritance:

fugue\_ibis.execution.pandas\_backend
-------------------------------------

.. automodule:: fugue_ibis.execution.pandas_backend
   :members:
   :undoc-members:
   :show-inheritance:

--------------------------------------------------------------------------------
/docs/api/fugue.extensions.outputter.rst:
--------------------------------------------------------------------------------
fugue.extensions.outputter
===========================

.. |SchemaLikeObject| replace:: :ref:`Schema like object `
.. |ParamsLikeObject| replace:: :ref:`Parameters like object `
.. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
.. |PartitionLikeObject| replace:: :ref:`Partition like object `
.. |RPCHandlerLikeObject| replace:: :ref:`RPCHandler like object `

.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`

.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
.. |ZipComap| replace:: :ref:`Zip & Comap `
.. |LoadSave| replace:: :ref:`Load & Save `
.. |AutoPersist| replace:: :ref:`Auto Persist `
.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
.. |CoTransformer| replace:: :ref:`CoTransformer `
.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
.. |FugueDataTypes| replace:: :doc:`Fugue Data Types `


fugue.extensions.outputter.convert
----------------------------------

.. automodule:: fugue.extensions.outputter.convert
   :members:
   :undoc-members:
   :show-inheritance:

fugue.extensions.outputter.outputter
------------------------------------

.. automodule:: fugue.extensions.outputter.outputter
   :members:
   :undoc-members:
   :show-inheritance:

--------------------------------------------------------------------------------
/docs/api/fugue.extensions.processor.rst:
--------------------------------------------------------------------------------
fugue.extensions.processor
===========================

.. |SchemaLikeObject| replace:: :ref:`Schema like object `
.. |ParamsLikeObject| replace:: :ref:`Parameters like object `
.. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
.. |PartitionLikeObject| replace:: :ref:`Partition like object `
.. |RPCHandlerLikeObject| replace:: :ref:`RPCHandler like object `

.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`

.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
.. |ZipComap| replace:: :ref:`Zip & Comap `
.. |LoadSave| replace:: :ref:`Load & Save `
.. |AutoPersist| replace:: :ref:`Auto Persist `
.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
.. |CoTransformer| replace:: :ref:`CoTransformer `
.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
.. |FugueDataTypes| replace:: :doc:`Fugue Data Types `


fugue.extensions.processor.convert
----------------------------------

.. automodule:: fugue.extensions.processor.convert
   :members:
   :undoc-members:
   :show-inheritance:

fugue.extensions.processor.processor
------------------------------------

.. automodule:: fugue.extensions.processor.processor
   :members:
   :undoc-members:
   :show-inheritance:

--------------------------------------------------------------------------------
/tests/fugue/execution/test_api.py:
--------------------------------------------------------------------------------
from pytest import raises

import fugue.api as fa
from fugue import NativeExecutionEngine, register_global_conf
from fugue.exceptions import FugueInvalidOperation


class MyEngine(NativeExecutionEngine):
    def __init__(self, conf=None):
        super().__init__(conf)
        self.pre_enter_state = []
        self.post_exit_state = []
        self.stop_calls = 0

    def on_enter_context(self) -> None:
        self.pre_enter_state += [self.in_context]

    def on_exit_context(self) -> None:
        self.post_exit_state += [self.in_context]

    def stop_engine(self) -> None:
        self.stop_calls += 1


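# Walks the engine context lifecycle: a global engine, nested context engines,
# and how each layer's conf shadows the global conf until it exits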
def test_engine_operations():
    o = MyEngine()
    assert fa.get_current_conf().get("fugue.x", 0) == 0
    register_global_conf({"fugue.x": 1})
    assert fa.get_current_conf().get("fugue.x", 0) == 1
    e = fa.set_global_engine(o, {"fugue.x": 2})
    assert e.pre_enter_state == [False]
    assert e.post_exit_state == []
    assert fa.get_current_conf().get("fugue.x", 0) == 2
    assert isinstance(e, NativeExecutionEngine)
    assert e.in_context and e.is_global
    assert fa.get_context_engine() is e
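    # a context engine temporarily shadows the global engine and its conf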
    with fa.engine_context("duckdb", {"fugue.x": 3}) as e2:
        assert fa.get_current_conf().get("fugue.x", 0) == 3
        assert fa.get_context_engine() is e2
        assert not e2.is_global and e2.in_context
        with e.as_context():
            assert e.pre_enter_state == [False, True]
            assert e.post_exit_state == []
            assert fa.get_current_conf().get("fugue.x", 0) == 2
            assert not e2.is_global and e2.in_context
            assert e.in_context and e.is_global
            assert fa.get_context_engine() is e
            assert e.stop_calls == 0
        assert e.pre_enter_state == [False, True]
        assert e.post_exit_state == [True]
        assert fa.get_current_conf().get("fugue.x", 0) == 3
        assert e.in_context and e.is_global
        assert fa.get_context_engine() is e2
    assert e.stop_calls == 0
    assert e.pre_enter_state == [False, True]
    assert e.post_exit_state == [True]
    assert fa.get_current_conf().get("fugue.x", 0) == 2
    assert not e2.is_global and not e2.in_context
    assert e.in_context and e.is_global
    e3 = fa.set_global_engine("duckdb", {"fugue.x": 4})
    assert e.stop_calls == 1
    assert e.pre_enter_state == [False, True]
    assert e.post_exit_state == [True, False]
    assert fa.get_current_conf().get("fugue.x", 0) == 4
    assert not e.in_context and not e.is_global
    assert e3.in_context and e3.is_global
    fa.clear_global_engine()
    assert not e3.in_context and not e3.is_global
    assert fa.get_current_conf().get("fugue.x", 0) == 1
    raises(FugueInvalidOperation, lambda: fa.get_context_engine())

--------------------------------------------------------------------------------
/docs/api/fugue.column.rst:
--------------------------------------------------------------------------------
fugue.column
=============

.. |SchemaLikeObject| replace:: :ref:`Schema like object `
.. |ParamsLikeObject| replace:: :ref:`Parameters like object `
.. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
.. |PartitionLikeObject| replace:: :ref:`Partition like object `
.. |RPCHandlerLikeObject| replace:: :ref:`RPCHandler like object `

.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`

.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
.. |ZipComap| replace:: :ref:`Zip & Comap `
.. |LoadSave| replace:: :ref:`Load & Save `
.. |AutoPersist| replace:: :ref:`Auto Persist `
.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
.. |CoTransformer| replace:: :ref:`CoTransformer `
.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
.. |FugueDataTypes| replace:: :doc:`Fugue Data Types `


fugue.column.expressions
------------------------

.. automodule:: fugue.column.expressions
   :members:
   :undoc-members:
   :show-inheritance:

fugue.column.functions
----------------------

.. automodule:: fugue.column.functions
   :members:
   :undoc-members:
   :show-inheritance:

fugue.column.sql
----------------

.. automodule:: fugue.column.sql
   :members:
   :undoc-members:
   :show-inheritance:

--------------------------------------------------------------------------------
/docs/api_ray/fugue_ray.rst:
--------------------------------------------------------------------------------
fugue\_ray
===========

.. |SchemaLikeObject| replace:: :ref:`Schema like object `
.. |ParamsLikeObject| replace:: :ref:`Parameters like object `
.. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
.. |PartitionLikeObject| replace:: :ref:`Partition like object `
.. |RPCHandlerLikeObject| replace:: :ref:`RPCHandler like object `

.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine`
.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine`
.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow`

.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details
.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial `
.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial `
.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial `
.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial `
.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial `
.. |ZipComap| replace:: :ref:`Zip & Comap `
.. |LoadSave| replace:: :ref:`Load & Save `
.. |AutoPersist| replace:: :ref:`Auto Persist `
.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial `
.. |CoTransformer| replace:: :ref:`CoTransformer `
.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial `
.. |FugueDataTypes| replace:: :doc:`Fugue Data Types `


fugue\_ray.dataframe
--------------------

.. automodule:: fugue_ray.dataframe
   :members:
   :undoc-members:
   :show-inheritance:

fugue\_ray.execution\_engine
----------------------------

.. automodule:: fugue_ray.execution_engine
   :members:
   :undoc-members:
   :show-inheritance:

fugue\_ray.registry
-------------------

.. automodule:: fugue_ray.registry
   :members:
   :undoc-members:
   :show-inheritance:

--------------------------------------------------------------------------------
/tests/fugue_ibis/test_dataframe.py:
--------------------------------------------------------------------------------
from datetime import datetime
from typing import Any
from uuid import uuid4

import pandas as pd
import pytest

import fugue.api as fe
import fugue.test as ft
from fugue import ArrowDataFrame
from fugue.exceptions import FugueDataFrameOperationError
from fugue_test.dataframe_suite import DataFrameTests

from .mock.dataframe import MockDuckDataFrame
from .mock.tester import mockibisduck_session  # noqa: F401 # pylint: disable-all


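# Runs the shared DataFrame test suite against Ibis-backed DuckDB tables;
# suite tests that don't apply to this backend are overridden as no-ops below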
@ft.fugue_test_suite("mockibisduck", mark_test=True)
class IbisDataFrameTests(DataFrameTests.Tests):
    def df(self, data: Any = None, schema: Any = None) -> MockDuckDataFrame:
        df = ArrowDataFrame(data, schema)
        name = "_" + str(uuid4())[:5]
        con = self.context.engine.sql_engine.backend
        con.create_table(name, df.native, overwrite=True)
        return MockDuckDataFrame(con.table(name), schema=schema)

    def test_init_df(self):
        df = self.df([["x", 1]], "a:str,b:int")
        df = MockDuckDataFrame(df.native, "a:str,b:long")
        assert df.schema == "a:str,b:long"

    def test_is_local(self):
        df = self.df([["x", 1]], "a:str,b:int")
        assert not fe.is_local(df)
        assert fe.is_bounded(df)

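    # no-op override: skips the base suite's map type test for this backend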
    def test_map_type(self):
        pass

    def test_as_arrow(self):
        # empty
        df = self.df([], "a:int,b:int")
        assert [] == list(ArrowDataFrame(df.as_arrow()).as_dict_iterable())
        # pd.NaT
        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
        assert [dict(a=None, b=1)] == list(
            ArrowDataFrame(df.as_arrow()).as_dict_iterable()
        )
        # pandas timestamps
        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
        assert [dict(a=datetime(2020, 1, 1), b=1)] == list(
            ArrowDataFrame(df.as_arrow()).as_dict_iterable()
        )

    def test_deep_nested_types(self):
        pass

    def test_list_type(self):
        pass

    def test_native_table(self):
        df = self.df([["x", 1]], "a:str,b:int").native
        assert fe.get_schema(fe.rename(df, dict())) == "a:str,b:int"
        assert fe.get_schema(fe.rename(df, dict(a="c"))) == "c:str,b:int"

        with pytest.raises(Exception):
            fe.rename(df, dict(a="b"))

        with pytest.raises(FugueDataFrameOperationError):
            fe.rename(df, dict(x="y"))

        assert fe.get_schema(fe.drop_columns(df, [])) == "a:str,b:int"
        assert fe.get_schema(fe.drop_columns(df, ["a"])) == "b:int"

        with pytest.raises(FugueDataFrameOperationError):
            fe.get_schema(fe.drop_columns(df, ["a", "b"]))

        with pytest.raises(FugueDataFrameOperationError):
            fe.get_schema(fe.drop_columns(df, ["a", "c"]))

--------------------------------------------------------------------------------
/fugue/__init__.py:
--------------------------------------------------------------------------------
# flake8: noqa
from triad.collections import Schema

from fugue.api import out_transform, transform
from fugue.bag.array_bag import ArrayBag
from fugue.bag.bag import Bag, BagDisplay
from fugue.collections.partition import PartitionCursor, PartitionSpec
from fugue.collections.sql import StructuredRawSQL, TempTableName
from fugue.collections.yielded import PhysicalYielded, Yielded
from fugue.constants import register_global_conf
from fugue.dataframe.array_dataframe import ArrayDataFrame
from fugue.dataframe.arrow_dataframe import ArrowDataFrame
from fugue.dataframe.dataframe import (
    AnyDataFrame,
    DataFrame,
    DataFrameDisplay,
    LocalBoundedDataFrame,
    LocalDataFrame,
)
from fugue.dataframe.dataframe_iterable_dataframe import (
    IterableArrowDataFrame,
    IterablePandasDataFrame,
    LocalDataFrameIterableDataFrame,
)
from fugue.dataframe.dataframes import DataFrames
from fugue.dataframe.iterable_dataframe import IterableDataFrame
from fugue.dataframe.pandas_dataframe import PandasDataFrame
from fugue.dataset import (
    AnyDataset,
    Dataset,
    DatasetDisplay,
    as_fugue_dataset,
    get_dataset_display,
)
from fugue.execution.execution_engine import (
    AnyExecutionEngine,
    EngineFacet,
    ExecutionEngine,
    MapEngine,
    SQLEngine,
)
from fugue.execution.factory import (
    is_pandas_or,
    make_execution_engine,
    make_sql_engine,
    register_default_execution_engine,
    register_default_sql_engine,
    register_execution_engine,
    register_sql_engine,
)
from fugue.execution.native_execution_engine import (
    NativeExecutionEngine,
    PandasMapEngine,
    QPDPandasEngine,
)
from fugue.extensions.creator import Creator, creator, register_creator
from fugue.extensions.outputter import Outputter, outputter, register_outputter
from fugue.extensions.processor import Processor, processor, register_processor
from fugue.extensions.transformer import (
    CoTransformer,
    OutputCoTransformer,
    OutputTransformer,
    Transformer,
    cotransformer,
    output_cotransformer,
    output_transformer,
    register_output_transformer,
    register_transformer,
    transformer,
)
from fugue.registry import _register
from fugue.rpc import (
    EmptyRPCHandler,
    RPCClient,
    RPCFunc,
    RPCHandler,
    RPCServer,
    make_rpc_server,
    to_rpc_handler,
)
from fugue.sql.api import fugue_sql_flow as fsql
from fugue.sql.workflow import FugueSQLWorkflow
from fugue.workflow._workflow_context import FugueWorkflowContext
from fugue.workflow.module import module
from fugue.workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames
from fugue_version import __version__

from .dev import *

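# wire up Fugue's built-in extensions and engines (defined in fugue.registry)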
_register()

--------------------------------------------------------------------------------
/docs/api/fugue.collections.rst:
--------------------------------------------------------------------------------
1 | fugue.collections
2 | ==================
3 |
4 | .. |SchemaLikeObject| replace:: :ref:`Schema like object `
5 | .. |ParamsLikeObject| replace:: :ref:`Parameters like object `
6 | .. |DataFrameLikeObject| replace:: :ref:`DataFrame like object `
7 | .. |DataFramesLikeObject| replace:: :ref:`DataFrames like object `
8 | .. |PartitionLikeObject| replace:: :ref:`Partition like object