├── tests ├── __init__.py ├── test_flow │ ├── __init__.py │ ├── test_persistence_compatibility │ │ ├── artifacts │ │ │ ├── total_sum │ │ │ │ ├── 2f000e88-5a8f-4762-b7c4-77eb444348f6 │ │ │ │ │ └── total_sum.json │ │ │ │ └── faed6d2b-5b8c-449a-9fd6-946bec4f5b0d │ │ │ │ │ └── total_sum.json │ │ │ ├── lowercase_sum │ │ │ │ ├── 1c22c085-6fa5-4df7-a69a-a03a3e880e90 │ │ │ │ │ └── lowercase_sum.json │ │ │ │ └── 44556b2a-bd8f-44c6-a1bb-1a03ed2a839f │ │ │ │ │ └── lowercase_sum.json │ │ │ ├── uppercase_sum │ │ │ │ ├── 5998ef92-4102-4e9c-9ef5-f996da3a9fd9 │ │ │ │ │ └── uppercase_sum.json │ │ │ │ └── fbdc03e4-c713-4a7f-aca6-79bd31bb9d62 │ │ │ │ │ └── uppercase_sum.json │ │ │ ├── lowercase_chars │ │ │ │ └── cfe1e872-5b26-4733-9859-4d323d667ae5 │ │ │ │ │ └── lowercase_chars.setpkl │ │ │ │ │ ├── type.pkl │ │ │ │ │ └── items │ │ │ │ │ ├── 007b2b8ca6c265851d06cf3ba2ffcb4d6acc7e23883fbbde5b73afad3444260f_0.pkl │ │ │ │ │ ├── 0b5417898974f490fdf4a442f711925284da871660232c34ba2f8d98cae479dc_0.pkl │ │ │ │ │ ├── 730c53e7abe3c1fa5ec658e2c1139bf73026d56b3b933cf34c7b663d905b28bf_0.pkl │ │ │ │ │ ├── 897f2e2b559dd876ad870c82283197b8cfecdf84736192ea6fb9ee5a5080a3a4_0.pkl │ │ │ │ │ ├── bb2940ae26249720daf30d8464d1002c8c09d8f87688aab9cfbbddcdaf22f79f_0.pkl │ │ │ │ │ └── e4e3cdb83096746758d4f418c1c11d93ffdfbab5a4eebffef734e4396c2ce181_0.pkl │ │ │ └── uppercase_chars │ │ │ │ └── 185898d4-eaeb-46dc-85db-498018b29756 │ │ │ │ └── uppercase_chars.setpkl │ │ │ │ ├── type.pkl │ │ │ │ └── items │ │ │ │ ├── 0e359834dbf9b14f902538ac42ef4ce523a7f665f04a0a985c5e7fe83df360a3_0.pkl │ │ │ │ ├── 21c27bb5c58f87daff8b16ac6dcd17b62345515033e5d8fa66fd44bcfb357780_0.pkl │ │ │ │ ├── 2432d9437cf69add843d4b37526aafb6e28b4edbd3b65a13bec0c99b4628304b_0.pkl │ │ │ │ ├── 3523c5c4504ff1e243867443a194deac2b64c05fd43f6eee5b4c172fcfd5f5bf_0.pkl │ │ │ │ ├── b511f210249bc8eb40056e3fc2383161ca20585396904cc84f77c31f289be4aa_0.pkl │ │ │ │ └── df62494217bc7fffc20d07542eeb8e269a35b6616fd80d26ada10561af513314_0.pkl │ │ └── inventory │ │ │ ├── lowercase_chars │ │ │ └── c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19 │ │ │ │ └── d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4 │ │ │ │ └── metadata_0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f.yaml │ │ │ ├── uppercase_chars │ │ │ └── 9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45 │ │ │ │ └── 12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975 │ │ │ │ └── metadata_2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0.yaml │ │ │ ├── lowercase_sum │ │ │ ├── 18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365 │ │ │ │ └── abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7 │ │ │ │ │ └── metadata_d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb.yaml │ │ │ └── df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae │ │ │ │ └── 928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f │ │ │ │ └── metadata_68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633.yaml │ │ │ ├── uppercase_sum │ │ │ ├── 4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3 │ │ │ │ └── 2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91 │ │ │ │ │ └── metadata_dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5.yaml │ │ │ └── 77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8 │ │ │ │ └── 25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542 │ │ │ │ └── 
metadata_154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035.yaml │ │ │ └── total_sum │ │ │ ├── 401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0 │ │ │ └── f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b │ │ │ │ └── metadata_3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34.yaml │ │ │ └── e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98 │ │ │ └── e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c │ │ │ └── metadata_354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d.yaml │ ├── test_execution.py │ ├── test_relative_cache_path.py │ ├── test_plotting.py │ ├── test_persistence_compatibility.py │ ├── test_join.py │ ├── test_interactions.py │ ├── test_multi_out.py │ ├── generate_test_compatibility_cache.py │ ├── test_new_api.py │ ├── test_logging.py │ ├── test_outputs.py │ ├── test_copy.py │ ├── test_dagviz.py │ ├── test_persistence_aip.py │ └── test_executor.py ├── test_utils │ ├── __init__.py │ ├── test_urls.py │ ├── test_keyed_priority_stack.py │ └── test_misc.py ├── test_optdep.py ├── test_tokenize.py ├── test_helpers.py └── conftest.py ├── bionic ├── aip │ ├── __init__.py │ ├── client.py │ ├── state.py │ ├── main.py │ ├── docker_image_builder.py │ └── task.py ├── flake8 │ └── __init__.py ├── descriptors │ └── __init__.py ├── core │ └── __init__.py ├── utils │ ├── __init__.py │ ├── gcp_auth.py │ ├── files.py │ ├── reload.py │ ├── urls.py │ └── keyed_priority_stack.py ├── deps │ ├── __init__.py │ ├── extras.py │ └── optdep.py ├── util.py ├── __init__.py ├── interpret.py ├── exception.py ├── filecopier.py ├── tokenization.py ├── gcs.py ├── decoration.py └── protocol.py ├── example ├── __init__.py ├── basic_workflow.py ├── hello_world.py ├── intro_workflow.py ├── ml_workflow_cli.py └── ml_workflow.py ├── .dockerignore ├── MANIFEST.in ├── docs ├── tutorials │ ├── _tutorial_setup.py │ └── hello_world.ipynb ├── api │ ├── index.rst │ ├── util.rst │ ├── flow.rst │ ├── decorators.rst │ └── protocols.rst ├── get-help.rst ├── Makefile ├── maintaining.rst ├── index.rst ├── get-started.rst ├── future.rst ├── contributing.rst ├── warnings.rst └── conf.py ├── .bumpversion.cfg ├── .readthedocs.yml ├── .github ├── CODEOWNERS └── workflows │ ├── publish.yml │ └── bionic-test.yml ├── Dockerfile ├── setup.cfg ├── .pre-commit-config.yaml ├── .gitignore ├── README.md └── setup.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bionic/aip/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bionic/flake8/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_flow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/bionic/descriptors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Bionic cache files 2 | bndata 3 | -------------------------------------------------------------------------------- /bionic/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains Bionic core logic to execute tasks and their dependencies. 3 | """ 4 | -------------------------------------------------------------------------------- /bionic/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains reusable utility functions that don't have any Bionic-specific logic. 3 | """ 4 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/total_sum/2f000e88-5a8f-4762-b7c4-77eb444348f6/total_sum.json: -------------------------------------------------------------------------------- 1 | 1002 -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/total_sum/faed6d2b-5b8c-449a-9fd6-946bec4f5b0d/total_sum.json: -------------------------------------------------------------------------------- 1 | 1002 -------------------------------------------------------------------------------- /bionic/deps/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains modules for defining and importing Bionic's optional 3 | dependency packages. 
4 | """ 5 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_sum/1c22c085-6fa5-4df7-a69a-a03a3e880e90/lowercase_sum.json: -------------------------------------------------------------------------------- 1 | 597 -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_sum/44556b2a-bd8f-44c6-a1bb-1a03ed2a839f/lowercase_sum.json: -------------------------------------------------------------------------------- 1 | 597 -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_sum/5998ef92-4102-4e9c-9ef5-f996da3a9fd9/uppercase_sum.json: -------------------------------------------------------------------------------- 1 | 405 -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_sum/fbdc03e4-c713-4a7f-aca6-79bd31bb9d62/uppercase_sum.json: -------------------------------------------------------------------------------- 1 | 405 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | graft bionic 4 | graft tests 5 | graft example 6 | global-exclude __pycache__ *.py[co] *.sw[po] 7 | -------------------------------------------------------------------------------- /docs/tutorials/_tutorial_setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | project_path = str(Path("../..").resolve()) 5 | if project_path not in sys.path: 6 | sys.path.insert(0, project_path) 7 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | API Reference 3 | ============= 4 | 5 | These are the APIs provided by Bionic. 6 | 7 | .. toctree:: 8 | 9 | flow 10 | decorators 11 | protocols 12 | util 13 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.11.1 3 | commit = True 4 | tag = False 5 | 6 | [bumpversion:file:setup.py] 7 | 8 | [bumpversion:file:docs/conf.py] 9 | 10 | [bumpversion:file:bionic/__init__.py] 11 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: [] 3 | python: 4 | version: 3.7 5 | install: 6 | - method: pip 7 | path: . 
8 | extra_requirements: 9 | - dev 10 | system_packages: true 11 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/type.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/type.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/type.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/type.pkl -------------------------------------------------------------------------------- /bionic/utils/gcp_auth.py: -------------------------------------------------------------------------------- 1 | from bionic.deps.optdep import import_optional_dependency 2 | 3 | 4 | def get_gcp_project_id(): 5 | google_auth = import_optional_dependency( 6 | "google.auth", purpose="Get GCP project id from the environment" 7 | ) 8 | _, project = google_auth.default() 9 | return project 10 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Lines starting with '#' are comments. 2 | # Each line is a file pattern followed by one or more owners. 3 | # Check out the link below for more information. 4 | # https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners 5 | 6 | # These owners will be the default owners for everything in the repo. 7 | * @jqmp @namanjain @simonafk 8 | -------------------------------------------------------------------------------- /bionic/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module is deprecated and exists only for backwards compatibility. 3 | 4 | Some older documentation recommended using `bionic.util.init_basic_logging` to expose 5 | Bionic's logs. This function is now located at `bionic.utils.misc.init_basic_logging`. 6 | Eventually we should remove the need for this function and deprecate it there too. 
7 | """ 8 | 9 | from .utils.misc import init_basic_logging # noqa: F401 10 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/007b2b8ca6c265851d06cf3ba2ffcb4d6acc7e23883fbbde5b73afad3444260f_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/007b2b8ca6c265851d06cf3ba2ffcb4d6acc7e23883fbbde5b73afad3444260f_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/0b5417898974f490fdf4a442f711925284da871660232c34ba2f8d98cae479dc_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/0b5417898974f490fdf4a442f711925284da871660232c34ba2f8d98cae479dc_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/730c53e7abe3c1fa5ec658e2c1139bf73026d56b3b933cf34c7b663d905b28bf_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/730c53e7abe3c1fa5ec658e2c1139bf73026d56b3b933cf34c7b663d905b28bf_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/897f2e2b559dd876ad870c82283197b8cfecdf84736192ea6fb9ee5a5080a3a4_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/897f2e2b559dd876ad870c82283197b8cfecdf84736192ea6fb9ee5a5080a3a4_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/bb2940ae26249720daf30d8464d1002c8c09d8f87688aab9cfbbddcdaf22f79f_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/bb2940ae26249720daf30d8464d1002c8c09d8f87688aab9cfbbddcdaf22f79f_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/e4e3cdb83096746758d4f418c1c11d93ffdfbab5a4eebffef734e4396c2ce181_0.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/e4e3cdb83096746758d4f418c1c11d93ffdfbab5a4eebffef734e4396c2ce181_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/0e359834dbf9b14f902538ac42ef4ce523a7f665f04a0a985c5e7fe83df360a3_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/0e359834dbf9b14f902538ac42ef4ce523a7f665f04a0a985c5e7fe83df360a3_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/21c27bb5c58f87daff8b16ac6dcd17b62345515033e5d8fa66fd44bcfb357780_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/21c27bb5c58f87daff8b16ac6dcd17b62345515033e5d8fa66fd44bcfb357780_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/2432d9437cf69add843d4b37526aafb6e28b4edbd3b65a13bec0c99b4628304b_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/2432d9437cf69add843d4b37526aafb6e28b4edbd3b65a13bec0c99b4628304b_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/3523c5c4504ff1e243867443a194deac2b64c05fd43f6eee5b4c172fcfd5f5bf_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/3523c5c4504ff1e243867443a194deac2b64c05fd43f6eee5b4c172fcfd5f5bf_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/b511f210249bc8eb40056e3fc2383161ca20585396904cc84f77c31f289be4aa_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/b511f210249bc8eb40056e3fc2383161ca20585396904cc84f77c31f289be4aa_0.pkl 
-------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/df62494217bc7fffc20d07542eeb8e269a35b6616fd80d26ada10561af513314_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/df62494217bc7fffc20d07542eeb8e269a35b6616fd80d26ada10561af513314_0.pkl -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # An example docker image that can be used to test the AIP integration 2 | FROM python:3.8 3 | 4 | WORKDIR /code 5 | 6 | COPY README.md setup.py /code/ 7 | COPY bionic/deps/ /code/bionic/deps 8 | RUN ls /code/* 9 | 10 | 11 | RUN python setup.py egg_info && \ 12 | sed '/^\[/d' bionic.egg-info/requires.txt | sort | uniq >> requirements.txt && \ 13 | pip install -r requirements.txt 14 | 15 | COPY . ./ 16 | 17 | RUN pip install -e . 18 | -------------------------------------------------------------------------------- /tests/test_flow/test_execution.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import os 4 | 5 | 6 | @pytest.mark.allows_parallel 7 | def test_execution_mode(builder, parallel_execution_enabled): 8 | @builder 9 | def pid(): 10 | return os.getpid() 11 | 12 | current_pid = os.getpid() 13 | returned_pid = builder.build().get("pid") 14 | 15 | if parallel_execution_enabled: 16 | assert current_pid != returned_pid 17 | else: 18 | assert current_pid == returned_pid 19 | -------------------------------------------------------------------------------- /bionic/__init__.py: -------------------------------------------------------------------------------- 1 | from .flow import Flow, FlowBuilder # noqa: F401 2 | from .decorators import ( # noqa: F401 3 | version, 4 | version_no_warnings, 5 | output, 6 | outputs, 7 | docs, 8 | gather, 9 | persist, 10 | memoize, 11 | pyplot, 12 | immediate, 13 | changes_per_run, 14 | accepts, 15 | returns, 16 | run_in_aip, 17 | ) 18 | 19 | from . import protocol # noqa: F401 20 | from . import util # noqa: F401 21 | 22 | __version__ = "0.11.1" 23 | -------------------------------------------------------------------------------- /docs/get-help.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Get Help 3 | ======== 4 | 5 | For help using Bionic, please post your question on `Stack Overflow 6 | `_. Until Bionic has its own `tag 7 | `_, it's good to use all three of the following 8 | words in your question so we can find it easily: "bionic", "framework", and 9 | "python". 10 | 11 | For bug reports and feature requests: please use our `GitHub Issue 12 | Tracker `_.
13 | -------------------------------------------------------------------------------- /example/basic_workflow.py: -------------------------------------------------------------------------------- 1 | import bionic as bn 2 | 3 | builder = bn.FlowBuilder("basic_workflow") 4 | 5 | builder.assign("x", values=[2, 3]) 6 | builder.assign("y", values=[5, 7]) 7 | 8 | 9 | @builder 10 | def x_plus_y(x, y): 11 | return x + y 12 | 13 | 14 | flow = builder.build() 15 | 16 | if __name__ == "__main__": 17 | bn.utils.misc.init_basic_logging() 18 | 19 | for _, row in flow.get("x_plus_y", "series").reset_index().iterrows(): 20 | print(f"{row['x']} + {row['y']} = {row['x_plus_y']}") 21 | -------------------------------------------------------------------------------- /docs/api/util.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Utilities 3 | ==================== 4 | 5 | FileCopier 6 | ------------ 7 | When called with the ``mode='FileCopier'`` argument, 8 | :meth:`Flow.get ` can return a 9 | :class:`FileCopier ` instance. This is simply a 10 | utility class that exposes a 11 | :meth:`copy ` method, enabling the 12 | user to copy files around without knowing any internal details about where 13 | Bionic stores them. 14 | 15 | FileCopier API 16 | --------------- 17 | 18 | .. autoclass:: bionic.filecopier.FileCopier 19 | :members: -------------------------------------------------------------------------------- /bionic/aip/client.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bionic.deps.optdep import import_optional_dependency 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | _cached_aip_client = None 8 | 9 | 10 | def get_aip_client(cache_value=True): 11 | if cache_value: 12 | global _cached_aip_client 13 | if _cached_aip_client is None: 14 | _cached_aip_client = get_aip_client(cache_value=False) 15 | return _cached_aip_client 16 | 17 | discovery = import_optional_dependency( 18 | "googleapiclient.discovery", raise_on_missing=True 19 | ) 20 | logger.info("Initializing AIP client ...") 21 | return discovery.build("ml", "v1", cache_discovery=False) 22 | -------------------------------------------------------------------------------- /tests/test_flow/test_relative_cache_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def test_move_cache_files(builder, tmp_path): 6 | builder.assign("x", 2) 7 | builder.assign("y", 3) 8 | 9 | @builder 10 | def xy(x, y): 11 | return x * y 12 | 13 | cur_dir = os.path.join(tmp_path, "current") 14 | new_dir = os.path.join(tmp_path, "new") 15 | 16 | builder.set("core__persistent_cache__flow_dir", cur_dir) 17 | flow = builder.build() 18 | # call a method to create cache 19 | assert flow.get("xy") == 6 20 | 21 | shutil.copytree(cur_dir, new_dir) 22 | 23 | builder.set("core__persistent_cache__flow_dir", new_dir) 24 | flow = builder.build() 25 | assert flow.get("xy") == 6 26 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.7' 18 | - name: Install 
dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools wheel twine 22 | - name: Build and publish 23 | env: 24 | TWINE_USERNAME: __token__ 25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 26 | run: | 27 | python setup.py sdist bdist_wheel --universal 28 | twine upload --verbose dist/* 29 | -------------------------------------------------------------------------------- /tests/test_optdep.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bionic.deps.optdep import ( 4 | import_optional_dependency, 5 | TEST_EXTRA_NAME, 6 | TEST_PACKAGE_NAME, 7 | ) 8 | 9 | 10 | def test_import_missing_dependency(): 11 | with pytest.raises( 12 | ImportError, 13 | match=".*%s.*PURPOSE.*pip install 'bionic\\[%s\\]'.*" 14 | % (TEST_PACKAGE_NAME, TEST_EXTRA_NAME), 15 | ): 16 | import_optional_dependency(TEST_PACKAGE_NAME, purpose="PURPOSE") 17 | 18 | 19 | def test_import_missing_dependency_without_raising(): 20 | module = import_optional_dependency(TEST_PACKAGE_NAME, raise_on_missing=False) 21 | assert module is None 22 | 23 | 24 | def test_import_unrecognized_dependency(): 25 | with pytest.raises(AssertionError): 26 | import_optional_dependency("_UNKNOWN_PACKAGE_", purpose="PURPOSE") 27 | -------------------------------------------------------------------------------- /bionic/aip/state.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | 4 | class AipError(Exception): 5 | pass 6 | 7 | 8 | class State(Enum): 9 | STATE_UNSPECIFIED = auto() 10 | QUEUED = auto() 11 | PREPARING = auto() 12 | RUNNING = auto() 13 | SUCCEEDED = auto() 14 | FAILED = auto() 15 | CANCELLING = auto() 16 | CANCELLED = auto() 17 | 18 | def is_executing(self): 19 | return self in { 20 | State.STATE_UNSPECIFIED, 21 | State.QUEUED, 22 | State.PREPARING, 23 | State.RUNNING, 24 | } 25 | 26 | def is_cancelled(self): 27 | return self in {State.CANCELLING, State.CANCELLED} 28 | 29 | def is_finished(self): 30 | return self in { 31 | State.SUCCEEDED, 32 | State.FAILED, 33 | State.CANCELLING, 34 | State.CANCELLED, 35 | } 36 | -------------------------------------------------------------------------------- /bionic/utils/files.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with files. 3 | """ 4 | 5 | import shutil 6 | 7 | 8 | def ensure_parent_dir_exists(path): 9 | ensure_dir_exists(path.parent) 10 | 11 | 12 | def ensure_dir_exists(path): 13 | path.mkdir(parents=True, exist_ok=True) 14 | 15 | 16 | def recursively_copy_path(src_path, dst_path): 17 | if not src_path.exists(): 18 | raise ValueError(f"Path does not exist: {src_path}") 19 | ensure_parent_dir_exists(dst_path) 20 | 21 | if src_path.is_file(): 22 | shutil.copyfile(str(src_path), str(dst_path)) 23 | else: 24 | shutil.copytree(str(src_path), str(dst_path)) 25 | 26 | 27 | def recursively_delete_path(path): 28 | if not path.exists(): 29 | raise ValueError(f"Path does not exist: {path}") 30 | 31 | if path.is_file(): 32 | path.unlink() 33 | else: 34 | shutil.rmtree(path) 35 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # You can set these variables from the command line, and also from the environment for 2 | # the first two. 3 | SPHINXOPTS ?= 4 | SPHINXBUILD ?= sphinx-build 5 | SOURCEDIR = . 
6 | BUILDDIR = _build 7 | 8 | # Put this first so that "make" without argument is like "make help". 9 | help: 10 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 11 | 12 | open: html 13 | open _build/html/index.html 14 | 15 | livehtml: 16 | sphinx-autobuild --ignore '*.swp' --ignore 'tutorials/bndata/**/*' --ignore 'tutorials/.ipynb_checkpoints/**/*' -b html $(ALLSPHINXOPTS) . $(BUILDDIR)/html 17 | 18 | .PHONY: help Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /bionic/interpret.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convenience functions for handling arguments based on their type. These can be 3 | used to provide "Pandas-like" APIs that accept (e.g.) either a string or a list 4 | of strings. 5 | """ 6 | 7 | 8 | def str_or_seq_as_list(value): 9 | if isinstance(value, str): 10 | return [value] 11 | elif is_iterable(value): 12 | return list(value) 13 | else: 14 | raise TypeError(f"Expected a string or sequence; got {value!r}") 15 | 16 | 17 | def str_or_seq_or_none_as_list(value): 18 | if isinstance(value, str): 19 | return [value] 20 | elif is_iterable(value): 21 | return list(value) 22 | elif value is None: 23 | return [] 24 | else: 25 | raise TypeError(f"Expected a string or sequence or None; got {value!r}") 26 | 27 | 28 | def is_iterable(x): 29 | try: 30 | iter(x) 31 | return True 32 | except TypeError: 33 | return False 34 | -------------------------------------------------------------------------------- /example/hello_world.py: -------------------------------------------------------------------------------- 1 | import bionic as bn 2 | 3 | # Initialize the builder object we'll use to construct our flow. 4 | builder = bn.FlowBuilder("hello_world") 5 | 6 | # Define new entities "greeting" and "subject" with fixed values. 7 | builder.assign("greeting", "Hello") 8 | builder.assign("subject", "world") 9 | 10 | 11 | # Define a "message" entity, constructed by taking the values of "greeting" and 12 | # "subject" and combining them in a sentence. 13 | # The `@builder` decorator tells Bionic to define a new derived entity; Bionic 14 | # infers the name of the new entity ("message") and the names of its 15 | # dependencies ("greeting" and "subject"). 16 | @builder 17 | def message(greeting, subject): 18 | return f"{greeting} {subject}!" 19 | 20 | 21 | # Assemble the flow object, which is capable of computing any of the entities 22 | # we've defined. 23 | flow = builder.build() 24 | 25 | if __name__ == "__main__": 26 | # Use our flow to compute the message "Hello world!" 27 | print(flow.get("message")) 28 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=88 3 | exclude = docs,.venv 4 | ignore = 5 | # These rules are not compatible with black (our code formatter). 6 | E203 # "whitespace before ':'" 7 | W503 # "line break occurred before a binary operator" 8 | # Black handles line lengths for us (slightly less strictly than flake8). 9 | E501 # "line too long" 10 | # We allow TODO and XXX comments in code. 
11 | # (But we don't allow FIX-ME -- hyphen inserted so this string doesn't show up in 12 | # searches.) 13 | T101 # "fixme found (TODO)" 14 | T102 # "fixme found (XXX)" 15 | per-file-ignores = 16 | # Allow print statements in example code. 17 | build/lib/example/*:T201 18 | example/*:T201 19 | 20 | # NOTE On my MacBook this plugin adds about 1 extra second to Flake8's runtime, making 21 | # it about 5s total. That's not trivial, so it might not be worth it to have this 22 | # enabled all the time. 23 | [flake8:local-plugins] 24 | extension = 25 | DNM1 = bionic.flake8.check_dnode_match:Checker 26 | 27 | [tool:pytest] 28 | filterwarnings=ignore::DeprecationWarning 29 | -------------------------------------------------------------------------------- /tests/test_flow/test_plotting.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import bionic as bn 4 | 5 | 6 | def test_pyplot_no_parens(builder): 7 | @builder 8 | @bn.pyplot 9 | def plot(pyplot): 10 | ax = pyplot.subplot() 11 | ax.plot([1, 2, 3], [1, 3, 9]) 12 | 13 | img = builder.build().get("plot") 14 | assert img.width > 0 15 | assert img.height > 0 16 | 17 | 18 | def test_pyplot_no_args(builder): 19 | @builder 20 | @bn.pyplot() 21 | def plot(pyplot): 22 | ax = pyplot.subplot() 23 | ax.plot([1, 2, 3], [1, 3, 9]) 24 | 25 | img = builder.build().get("plot") 26 | assert img.width > 0 27 | assert img.height > 0 28 | 29 | 30 | def test_pyplot_name_arg(builder): 31 | @builder 32 | @bn.pyplot("plt") 33 | def plot(plt): 34 | ax = plt.subplot() 35 | ax.plot([1, 2, 3], [1, 3, 9]) 36 | 37 | img = builder.build().get("plot") 38 | assert img.width > 0 39 | assert img.height > 0 40 | 41 | 42 | def test_pyplot_missing_dep(builder): 43 | with pytest.raises(ValueError): 44 | 45 | @builder 46 | @bn.pyplot 47 | def plot(some_arg): 48 | pass 49 | -------------------------------------------------------------------------------- /bionic/exception.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bionic-specific exception classes. 
3 | """ 4 | 5 | 6 | class UndefinedEntityError(KeyError): 7 | @classmethod 8 | def for_name(cls, name): 9 | return cls(f"Entity {name!r} is not defined") 10 | 11 | 12 | class AlreadyDefinedEntityError(ValueError): 13 | @classmethod 14 | def for_name(cls, name): 15 | return cls(f"Entity {name!r} is already defined") 16 | 17 | 18 | class UnsetEntityError(ValueError): 19 | pass 20 | 21 | 22 | class IncompatibleEntityError(ValueError): 23 | pass 24 | 25 | 26 | class UnsupportedSerializedValueError(Exception): 27 | pass 28 | 29 | 30 | class CodeVersioningError(Exception): 31 | def __init__(self, message, bad_descriptor): 32 | super(CodeVersioningError, self).__init__(message) 33 | self.bad_descriptor = bad_descriptor 34 | 35 | 36 | class EntitySerializationError(Exception): 37 | pass 38 | 39 | 40 | class EntityComputationError(Exception): 41 | pass 42 | 43 | 44 | class EntityValueError(ValueError): 45 | pass 46 | 47 | 48 | class AttributeValidationError(Exception): 49 | pass 50 | 51 | 52 | class MalformedDescriptorError(Exception): 53 | pass 54 | -------------------------------------------------------------------------------- /tests/test_tokenize.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from bionic.tokenization import tokenize 4 | 5 | 6 | def test_tokenize_straight_translation(): 7 | assert tokenize(1) == "1" 8 | assert tokenize(1.0) == "1.0" 9 | assert tokenize("hello") == "hello" 10 | 11 | 12 | def test_tokenize_simple_cleaning(): 13 | assert tokenize("Hello").startswith("hello_") 14 | assert tokenize(True).startswith("true_") 15 | assert tokenize("test\x00").startswith("test._") 16 | 17 | 18 | def test_avoid_initial_period(): 19 | assert tokenize(".test").startswith("_.test") 20 | assert tokenize("\x00\x00").startswith("_..") 21 | 22 | 23 | def test_ensure_token_length_is_capped(): 24 | assert len(tokenize("a" * 1000)) < 50 25 | 26 | 27 | class Point: 28 | def __init__(self, x, y): 29 | self.x = x 30 | self.y = y 31 | 32 | 33 | def test_tokenize_complex_type(): 34 | token = tokenize(Point(1, 2), pickle.dumps) 35 | assert isinstance(token, str) 36 | assert len(token) == 10 37 | 38 | 39 | def test_tokenize_no_collisions(): 40 | points = [Point(x, y) for x in range(100) for y in range(100)] 41 | tokens = [tokenize(point, pickle.dumps) for point in points] 42 | assert len(set(tokens)) == len(points) 43 | -------------------------------------------------------------------------------- /tests/test_utils/test_urls.py: -------------------------------------------------------------------------------- 1 | from bionic.persistence import relativize_url, derelativize_url 2 | 3 | 4 | rel_artifact_url = "../artifacts/artifact.pkl" 5 | abs_artifact_url = "file:///Users/User/cache/artifacts/artifact.pkl" 6 | abs_metadata_url = "file:///Users/User/cache/metadata/metadata.yaml" 7 | gcs_artifact_url = "gs://my_bucket/cache/artifacts/artifact.pkl" 8 | gcs_metadata_url = "gs://my_bucket/cache/metadata/metadata.yaml" 9 | 10 | 11 | # file url tests 12 | def test_relativize_abs_file_urls(): 13 | assert relativize_url(abs_artifact_url, abs_metadata_url) == rel_artifact_url 14 | 15 | 16 | def test_relativize_relative_file_urls(): 17 | assert relativize_url(rel_artifact_url, abs_metadata_url) == rel_artifact_url 18 | 19 | 20 | def test_derelativize_abs_file_urls(): 21 | assert derelativize_url(abs_artifact_url, abs_metadata_url) == abs_artifact_url 22 | 23 | 24 | def test_derelativize_relative_file_urls(): 25 | assert 
derelativize_url(rel_artifact_url, abs_metadata_url) == abs_artifact_url 26 | 27 | 28 | # gcs url tests 29 | def test_relativize_gcs_urls(): 30 | assert relativize_url(gcs_artifact_url, gcs_metadata_url) == gcs_artifact_url 31 | 32 | 33 | def test_derelativize_gcs_urls(): 34 | assert derelativize_url(gcs_artifact_url, gcs_metadata_url) == gcs_artifact_url 35 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://gitlab.com/pycqa/flake8 3 | rev: 3.8.3 4 | hooks: 5 | - id: flake8 6 | # Pre-commit does some static analysis by caching packages (can 7 | # be found in ~/.cache/pre-commit/). When used in a virtualenv 8 | # (like pyenv), flake8 does not work correctly with the default 9 | # language and the custom dnode match linter breaks. 10 | # To get around this, pre-commit devs recommend using it as a 11 | # "system" hook (default is "local"). See 12 | # https://github.com/pre-commit/pre-commit-hooks/issues/157 13 | # for more information on this issue. 14 | language: system 15 | 16 | - repo: https://github.com/pre-commit/pre-commit-hooks 17 | rev: v3.2.0 18 | hooks: 19 | - id: trailing-whitespace 20 | - id: end-of-file-fixer 21 | # TODO: This throws an error on MacOS Big Sur. Updating the rev 22 | # to v3.3.0 does not fix the problem either. Here is the error for 23 | # reference: 24 | # 25 | # could not determine a constructor for the tag 'tag:yaml.org,2002:python/tuple' 26 | # 27 | # - id: check-yaml 28 | - id: check-added-large-files 29 | 30 | - repo: https://github.com/psf/black 31 | rev: 20.8b1 32 | hooks: 33 | - id: black 34 | language_version: python3 35 | -------------------------------------------------------------------------------- /bionic/filecopier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the ``FileCopier`` class, which is essentially a file path with a useful 3 | ``copy`` method attached to it. 4 | """ 5 | 6 | import subprocess 7 | 8 | from bionic.gcs import upload_to_gcs 9 | 10 | 11 | class FileCopier: 12 | """ 13 | A wrapper for a Path object, exposing a ``copy`` method that will copy 14 | the underlying file to a local or cloud destination. 15 | 16 | Parameters 17 | ---------- 18 | src_file_path: Path 19 | A path to a file. 20 | """ 21 | 22 | def __init__(self, src_file_path): 23 | self.src_file_path = src_file_path 24 | 25 | def copy(self, destination): 26 | """ 27 | Copies the file that this FileCopier represents to `destination`. 28 | 29 | This supports both local and GCS destinations. For the former, we follow cp's 30 | conventions and for the latter we follow fsspec's put / put_file APIs which 31 | can be found at 32 | https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.
33 | 34 | Parameters 35 | ---------- 36 | 37 | destination: Path or str 38 | Where to copy the underlying file 39 | """ 40 | 41 | # handle gcs 42 | if str(destination).startswith("gs://"): 43 | upload_to_gcs(self.src_file_path, str(destination)) 44 | else: 45 | subprocess.check_call( 46 | ["cp", "-R", str(self.src_file_path), str(destination)] 47 | ) 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | .pytest_cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Pytest-profiling reports 50 | prof/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | # pyenv python configuration file 66 | .python-version 67 | 68 | # Datafiles 69 | *.csv 70 | *.gz 71 | *.h5 72 | *.pkl 73 | *.pk 74 | *.html 75 | *.log 76 | *.db 77 | *.db-journal 78 | 79 | # iPython Notebooks 80 | *.ipynb 81 | .ipynb_checkpoints 82 | 83 | # Vim swap files 84 | *.swp 85 | 86 | # Bionic cache files 87 | bndata 88 | 89 | # VSCode settings 90 | .vscode 91 | 92 | # Python virtual environment 93 | .venv 94 | 95 | # Test data 96 | !tests/test_flow/test_persistence_compatibility/artifacts/**/*.pkl 97 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import shutil 3 | 4 | from .generate_test_compatibility_cache import Harness, CACHE_TEST_DIR 5 | 6 | 7 | @pytest.fixture 8 | def older_serialized_cache_harness(make_counter, tmp_path): 9 | # shutil.copytree dest should not exist 10 | tmp_cache_path = tmp_path.joinpath("test_cache") 11 | shutil.copytree(CACHE_TEST_DIR, tmp_cache_path) 12 | harness = Harness(tmp_cache_path, make_counter) 13 | return harness 14 | 15 | 16 | # Tests caching backward compatibility by loading and deserializing 17 | # an old snapshot of the cache. Test failure indicates that the changes 18 | # made to the caching layer are backward incompatible. 19 | # In case of a failure, either 20 | # a) fix the caching logic so it's backward compatible or 21 | # b) update the cache schema version and generate a new cache snapshot. 22 | # 23 | # To update cache schema version, change `CACHE_SCHEMA_VERSION` in cache.py.
24 | 25 | # To regenerate the cache, run the following command from the bionic/ dir 26 | # `python -m tests.test_flow.generate_test_compatibility_cache` 27 | def test_caching_compatibility(older_serialized_cache_harness): 28 | flows = older_serialized_cache_harness.flows 29 | 30 | for flow in flows: 31 | assert ( 32 | flow.get("total_sum") == older_serialized_cache_harness.EXPECTED_TOTAL_SUM 33 | ) 34 | 35 | # Assert that no methods were called. 36 | assert older_serialized_cache_harness.lowercase_sum_counter.times_called() == 0 37 | assert older_serialized_cache_harness.uppercase_sum_counter.times_called() == 0 38 | assert older_serialized_cache_harness.total_sum_counter.times_called() == 0 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bionic 2 | 3 | Bionic is a framework for analyzing and modeling data in Python. It's designed 4 | to help you **iterate faster on your research**, and help your colleagues 5 | **reuse your code more easily**. 6 | 7 | Bionic is in alpha and evolving rapidly. We recommend it for research projects 8 | where the dataset fits in memory. We do not recommend it for pipelines running 9 | in production. 10 | 11 | Check out the [full documentation](https://bionic.readthedocs.io/en/stable/), 12 | or go straight to [Get 13 | Started](https://bionic.readthedocs.io/en/stable/get-started.html). 14 | 15 | ## Installation 16 | 17 | Bionic can be installed from PyPI: 18 | 19 | pip install bionic[standard] 20 | 21 | You'll probably want to install [Graphviz](https://www.graphviz.org/) as well. 22 | See the [Installation 23 | docs](https://bionic.readthedocs.io/en/stable/get-started.html#installation) 24 | for more details on installing and configuring Bionic's dependencies. 25 | 26 | ## Contributing 27 | 28 | See the 29 | [Contribution](https://bionic.readthedocs.io/en/stable/contributing.html) 30 | section of our docs. 31 | 32 | ## License 33 | 34 | Copyright 2019 Square, Inc. 35 | 36 | Licensed under the Apache License, Version 2.0 (the "License"); 37 | you may not use this file except in compliance with the License. 38 | You may obtain a copy of the License at 39 | 40 | http://www.apache.org/licenses/LICENSE-2.0 41 | 42 | Unless required by applicable law or agreed to in writing, software 43 | distributed under the License is distributed on an "AS IS" BASIS, 44 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 45 | See the License for the specific language governing permissions and 46 | limitations under the License.
47 | -------------------------------------------------------------------------------- /example/intro_workflow.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.stats import multivariate_normal 4 | from sklearn.linear_model import LinearRegression 5 | 6 | import bionic as bn 7 | 8 | builder = bn.FlowBuilder("intro") 9 | 10 | builder.assign("random_seed", 0) 11 | builder.assign("variance", 2) 12 | builder.assign("correlation", 0.5) 13 | builder.assign("n_samples", 1000) 14 | 15 | 16 | @builder 17 | def my_random_df(random_seed, variance, correlation, n_samples): 18 | data = multivariate_normal( 19 | mean=[0, 0], 20 | cov=[[variance, correlation * variance], [correlation * variance, variance]], 21 | ).rvs(size=n_samples, random_state=random_seed) 22 | return pd.DataFrame(columns=["x", "y"], data=data) 23 | 24 | 25 | @builder 26 | def my_model(my_random_df): 27 | model = LinearRegression() 28 | model.fit(my_random_df[["x"]], my_random_df["y"]) 29 | return model 30 | 31 | 32 | @builder 33 | def est_correlation(my_model): 34 | return my_model.coef_[0] 35 | 36 | 37 | @builder 38 | def est_intercept(my_model): 39 | return my_model.intercept_ 40 | 41 | 42 | @builder 43 | @bn.pyplot("plt") 44 | def my_plot(my_random_df, est_correlation, est_intercept, plt): 45 | with plt.style.context("seaborn-whitegrid"): 46 | plt.scatter(my_random_df["x"], my_random_df["y"], alpha=0.2) 47 | 48 | line_xs = np.array([my_random_df["x"].min(), my_random_df["x"].max()]) 49 | line_ys = (line_xs * est_correlation) + est_intercept 50 | plt.plot(line_xs, line_ys) 51 | 52 | 53 | flow = builder.build() 54 | 55 | if __name__ == "__main__": 56 | bn.util.init_basic_logging() 57 | 58 | print("Estimated intercept:", flow.get("est_intercept")) 59 | print("Estimated correlation:", flow.get("est_correlation")) 60 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | from runpy import run_path 6 | 7 | from setuptools import find_packages, setup 8 | 9 | # This appears to be the least annoying Python-version-agnostic way of loading 10 | # an external file. 11 | extras_require = run_path( 12 | os.path.join(os.path.dirname(__file__), "bionic", "deps/extras.py") 13 | )["extras_require"] 14 | 15 | with open("README.md") as readme_file: 16 | readme = readme_file.read() 17 | 18 | requirements = [ 19 | "attrs>=20.1", 20 | "cattrs", 21 | "PyYAML", 22 | "numpy", 23 | "pandas", 24 | "pyarrow", 25 | # 0.19.1 had a regression which was fixed in 0.19.2 26 | # See tobgu/pyrsistent#263 on GitHub.
27 | "pyrsistent!=0.19.1", 28 | "decorator<5", 29 | ] 30 | 31 | setup( 32 | name="bionic", 33 | version="0.11.1", 34 | description=( 35 | "A Python framework for building, running, and sharing data science " 36 | "workflows" 37 | ), 38 | long_description=readme, 39 | long_description_content_type="text/markdown", 40 | license="Apache License 2.0", 41 | author="Janek Klawe", 42 | author_email="janek@squareup.com", 43 | url="https://github.com/square/bionic", 44 | packages=find_packages(), 45 | include_package_data=True, 46 | install_requires=requirements, 47 | extras_require=extras_require, 48 | python_requires=">=3.7", 49 | zip_safe=False, 50 | keywords="bionic", 51 | classifiers=[ 52 | "Development Status :: 3 - Alpha", 53 | "Intended Audience :: Developers", 54 | "Natural Language :: English", 55 | "License :: OSI Approved :: Apache Software License", 56 | "Programming Language :: Python :: 3", 57 | "Programming Language :: Python :: 3.7", 58 | ], 59 | ) 60 | -------------------------------------------------------------------------------- /docs/api/flow.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Flow and FlowBuilder 3 | ==================== 4 | 5 | Introduction 6 | ------------ 7 | 8 | ``FlowBuilder`` and ``Flow`` are the primary interfaces for constructing and 9 | running Bionic flows. Either of them can be used to represent 10 | the collection of interdependent entities that make up a single analysis. The 11 | difference is that a ``FlowBuilder`` is a mutable object which can be updated, 12 | while a ``Flow`` is an immutable object which can perform computation. 13 | 14 | The typical pattern is to start with an empty ``FlowBuilder``, incrementally 15 | add entity definitions to it, then use ``FlowBuilder.build()`` to generate a 16 | ``Flow``. This ``Flow`` can be used immediately to compute entity values, or 17 | passed to other code, which might reconfigure or extend it. 18 | 19 | Although ``Flow`` objects are immutable, there is a mechanism for modifying 20 | them: instead of a method like ``set`` that mutates the ``Flow``, there is a 21 | ``setting`` method that returns a new copy with the requested change. This 22 | allows ``Flow``\ s to be easily customized without worrying about shared state. 23 | However, this API can only be used to update existing entities; if you want to 24 | define new entities, you'll need to convert the ``Flow`` back to a 25 | ``FlowBuilder`` using ``to_builder``. 26 | 27 | See `the Concepts documentation 28 | <../concepts.rst#flows-flowbuilders-and-entities>`_ for more details. 29 | 30 | FlowBuilder API 31 | --------------- 32 | 33 | .. autoclass:: bionic.FlowBuilder 34 | :members: 35 | 36 | FlowCase API 37 | ............ 38 | 39 | .. autoclass:: bionic.flow.FlowCase 40 | :members: 41 | 42 | Flow API 43 | -------- 44 | 45 | .. autoclass:: bionic.Flow 46 | :members: 47 | 48 | Cache API 49 | --------- 50 | 51 | .. autoclass:: bionic.cache_api.Cache 52 | :members: 53 | 54 | CacheEntry API 55 | -------------- 56 | 57 | .. 
autoclass:: bionic.cache_api.CacheEntry 58 | :members: -------------------------------------------------------------------------------- /tests/test_flow/test_join.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bionic.exception import UnsetEntityError 4 | 5 | 6 | @pytest.fixture(scope="function") 7 | def preset_builder(builder): 8 | builder.declare("x") 9 | builder.declare("y") 10 | builder.declare("z") 11 | 12 | @builder 13 | def xy(x, y): 14 | return x * y 15 | 16 | @builder 17 | def yz(y, z): 18 | return y * z 19 | 20 | @builder 21 | def xy_plus_yz(xy, yz): 22 | return xy + yz 23 | 24 | return builder 25 | 26 | 27 | def test_simple(preset_builder): 28 | builder = preset_builder 29 | 30 | builder.set("x", 2) 31 | builder.set("y", 3) 32 | builder.set("z", 4) 33 | 34 | flow = builder.build() 35 | 36 | assert flow.get("xy") == 6 37 | assert flow.get("yz") == 12 38 | assert flow.get("xy_plus_yz") == 18 39 | 40 | 41 | def test_cartesian_product(preset_builder): 42 | builder = preset_builder 43 | 44 | builder.set("x", values=[2]) 45 | builder.set("y", values=[3, 4]) 46 | builder.set("z", values=[5, 6, 7]) 47 | 48 | flow = builder.build() 49 | 50 | assert flow.get("xy", set) == {2 * 3, 2 * 4} # noqa: E226 51 | assert flow.get("yz", set) == { 52 | 3 * 5, 53 | 3 * 6, 54 | 3 * 7, 55 | 4 * 5, 56 | 4 * 6, 57 | 4 * 7, 58 | } # noqa: E226 59 | assert flow.get("xy_plus_yz", set) == { 60 | 2 * 3 + 3 * 5, 61 | 2 * 3 + 3 * 6, 62 | 2 * 3 + 3 * 7, 63 | 2 * 4 + 4 * 5, 64 | 2 * 4 + 4 * 6, 65 | 2 * 4 + 4 * 7, 66 | } # noqa: E226 67 | 68 | 69 | def test_empty(preset_builder): 70 | builder = preset_builder 71 | 72 | builder.set("y", 3) 73 | builder.set("z", values=[4, 5]) 74 | 75 | flow = builder.build() 76 | 77 | assert flow.get("xy", set) == set() 78 | assert flow.get("yz", set) == {12, 15} 79 | assert flow.get("xy_plus_yz", set) == set() 80 | 81 | with pytest.raises(UnsetEntityError): 82 | flow.get("xy") 83 | with pytest.raises(UnsetEntityError): 84 | flow.get("xy_plus_yz") 85 | -------------------------------------------------------------------------------- /bionic/utils/reload.py: -------------------------------------------------------------------------------- 1 | import builtins 2 | import importlib 3 | from sys import modules as module_registry 4 | from fnmatch import fnmatch 5 | from sysconfig import get_paths as sysconfig_paths 6 | 7 | 8 | def recursive_reload(module): 9 | """ 10 | Helper method to reload a module recursively. If a module imports a set of 11 | modules, then the modules in the set are also reloaded and so on. 12 | 13 | Modules that are part of the current python installation are not reloaded. 14 | For example, modules part of python standard library or modules installed 15 | through pip (or some other package manager that use distutils). 16 | 17 | Also note that this method may not be able to handle dynamic imports that 18 | only happens at runtime. For example, if a module imports another module 19 | only when a certain method is executed, reloading the former module does 20 | not guarantee that the latter module is reloaded. 
21 | """ 22 | 23 | original_import = builtins.__import__ 24 | already_reloaded = set() 25 | 26 | def custom_import(name, globals=None, locals=None, fromlist=[], level=0): 27 | if name in module_registry: 28 | module = module_registry[name] 29 | if name not in already_reloaded and not is_internal_module(module): 30 | already_reloaded.add(name) 31 | importlib.reload(module) 32 | return original_import(name, globals, locals, fromlist, level) 33 | 34 | try: 35 | builtins.__import__ = custom_import 36 | return importlib.reload(module) 37 | finally: 38 | builtins.__import__ = original_import 39 | 40 | 41 | def is_internal_module(module): 42 | return not hasattr(module, "__file__") or is_internal_file(module.__file__) 43 | 44 | 45 | def is_internal_file(filename): 46 | """ 47 | Helper method that determines whether the provided file is internal 48 | to Python, i.e., it's in the Python installation paths. 49 | """ 50 | return any( 51 | fnmatch(filename, file_dir + "/*") for file_dir in sysconfig_paths().values() 52 | ) 53 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/lowercase_chars/c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19/d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4/metadata_0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 3 | url: ../../../../artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl 4 | descriptor: lowercase_chars 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: 0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f 20 | functional_hash: c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19 21 | nominal_hash: d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - lowercase_chars 26 | - 9ed0cd8e69 27 | code_fingerprint: 28 | bytecode_hash: null 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: 9ed0cd8e69 34 | minor: '0' 35 | dep_digests: [] 36 | descriptor: 37 | exact_hash: 0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f 38 | functional_hash: c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19 39 | nominal_hash: d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4 40 | descriptor: lowercase_chars 41 | exact_hash: 0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f 42 | functional_hash: c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19 43 | nominal_hash: d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4 44 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/uppercase_chars/9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45/12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975/metadata_2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0.yaml: 
-------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 3 | url: ../../../../artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl 4 | descriptor: uppercase_chars 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - uppercase_chars 9 | - e99019711a 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: 2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0 20 | functional_hash: 9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45 21 | nominal_hash: 12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - uppercase_chars 26 | - e99019711a 27 | code_fingerprint: 28 | bytecode_hash: null 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: e99019711a 34 | minor: '0' 35 | dep_digests: [] 36 | descriptor: 37 | exact_hash: 2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0 38 | functional_hash: 9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45 39 | nominal_hash: 12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975 40 | descriptor: uppercase_chars 41 | exact_hash: 2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0 42 | functional_hash: 9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45 43 | nominal_hash: 12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975 44 | -------------------------------------------------------------------------------- /docs/maintaining.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | ================== 4 | Maintaining Bionic 5 | ================== 6 | 7 | This page documents project maintenance processes followed by Bionic’s core developers. 8 | If you’re not a core developer but need a new release for some reason, please contact 9 | one of the developers `listed here 10 | `_. 11 | 12 | Release Process 13 | --------------- 14 | 15 | We use `bumpversion `_ to manage our version 16 | strings and `GitHub Releases `_ to publish 17 | releases to PyPI. Follow these steps to release a new version of Bionic: 18 | 19 | 1. Merge a PR with updates to our version strings and release notes. 20 | 21 | a. Check out the current master branch. 22 | b. Create a new branch. 23 | c. Run ``bumpversion minor`` or ``bumpversion patch`` to bump the version. Running 24 | this command will create a new commit with all the version strings updated to the 25 | new version. 26 | d. Follow the commented instructions near ``Upcoming Version`` in the 27 | `release-notes.rst 28 | `_ file and 29 | update the upcoming version section. 30 | e. Amend the commit to add the release notes changes. 31 | f. Open a PR for your branch and merge it after approval from another core 32 | developer. 33 | 34 | 2. Once your PR is merged, create a release from the GitHub Releases page. 35 | 36 | a. On the `GitHub Releases `_ page, click 37 | ``Draft a new release``. 38 | b. Specify the bumped version as the ``Tag version`` and ``Release title``. Don't 39 | forget to prefix the version with ``v``. 
E.g., if the new version is ``0.8.0``, 40 | your tag and title should both be ``v0.8.0``. 41 | c. Click ``Publish Release``. 42 | d. Verify that the `Upload Python Package Action 43 | `_ 44 | workflow was completed successfully and the new release is visible on `PyPI 45 | `_. 46 | -------------------------------------------------------------------------------- /tests/test_flow/test_interactions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ..helpers import RoundingProtocol 4 | import bionic as bn 5 | 6 | 7 | @pytest.fixture(scope="function") 8 | def preset_builder(builder): 9 | builder.assign("n", values=[1, 2, 3]) 10 | 11 | @builder 12 | def xs(n): 13 | return list(range(n)) 14 | 15 | @builder 16 | def ys(xs): 17 | return [x**2 for x in xs] 18 | 19 | return builder 20 | 21 | 22 | def test_pyplot_then_gather(preset_builder): 23 | builder = preset_builder 24 | 25 | @builder 26 | @bn.pyplot("plt") 27 | @bn.gather("n", ["xs", "ys"]) 28 | def plot(gather_df, plt): 29 | for row in gather_df.itertuples(): 30 | plt.plot(row.xs, row.ys) 31 | 32 | img = builder.build().get("plot") 33 | assert img.width > 0 34 | assert img.height > 0 35 | 36 | 37 | def test_gather_then_pyplot(preset_builder): 38 | builder = preset_builder 39 | 40 | @builder 41 | @bn.gather("n", ["xs", "ys"]) 42 | @bn.pyplot("plt") 43 | def plot(gather_df, plt): 44 | for row in gather_df.itertuples(): 45 | plt.plot(row.xs, row.ys) 46 | 47 | img = builder.build().get("plot") 48 | assert img.width > 0 49 | assert img.height > 0 50 | 51 | 52 | def test_outputs_with_multiplicity(builder): 53 | builder.assign("x", values=[2, 3]) 54 | builder.assign("y", 4) 55 | 56 | @builder 57 | @bn.outputs("x_plus_y", "xy") 58 | def _(x, y): 59 | return (x + y), (x * y) 60 | 61 | @builder 62 | @bn.gather("xy") 63 | def sum_xy(gather_df): 64 | return gather_df["xy"].sum() 65 | 66 | @builder 67 | @bn.gather("x_plus_y") 68 | def sum_x_plus_y(gather_df): 69 | return gather_df["x_plus_y"].sum() 70 | 71 | flow = builder.build() 72 | assert flow.get("sum_xy") == 20 73 | assert flow.get("sum_x_plus_y") == 13 74 | 75 | flow = flow.clearing_cases("x") 76 | assert flow.get("sum_xy") == 0 77 | assert flow.get("sum_x_plus_y") == 0 78 | 79 | 80 | def test_outputs_with_protocols(builder): 81 | @builder 82 | @RoundingProtocol() 83 | @bn.outputs("x", "y") 84 | def _(): 85 | return 0.1, 1.9 86 | 87 | flow = builder.build() 88 | 89 | assert flow.get("x") == 0 90 | assert flow.get("y") == 2 91 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_longest_regex_prefix(): 5 | from .helpers import longest_regex_prefix_match 6 | 7 | def longest_prefix(regex, string): 8 | return longest_regex_prefix_match(regex, string).re.pattern 9 | 10 | assert longest_prefix("test", "test") == "test" 11 | assert longest_prefix("test", "te") == "te" 12 | assert longest_prefix("test", "text") == "te" 13 | assert longest_prefix("test", "testtest") == "test" 14 | assert longest_prefix("zest", "test") == "" 15 | assert longest_prefix("(test)", "test") == "(test)" 16 | assert longest_prefix("(test)", "text") == "" 17 | assert longest_prefix("(test)test", "testtest") == "(test)test" 18 | assert longest_prefix("(test)test", "testtext") == "(test)te" 19 | assert longest_prefix("x\n\n\nx", "x\n\n\nx") == "x\n\n\nx" 20 | assert longest_prefix("x\n\n\nx", 
"x\n\n\ny") == "x\n\n\n" 21 | assert longest_prefix("x\n\n\nx", "x\n\ny") == "x\n\n" 22 | assert longest_prefix("x\n\n\nx", "y\n\n\nx") == "" 23 | assert longest_prefix("test.*test", "testtest") == "test.*test" 24 | assert longest_prefix("test.*test", "testxxtest") == "test.*test" 25 | assert longest_prefix("test.*test", "testxxzest") == "test.*t" 26 | assert longest_prefix("test.*test", "testxxz") == "test.*" 27 | assert longest_prefix("test.*test", "texttest") == "te" 28 | 29 | 30 | def test_assert_re_matches(): 31 | from .helpers import assert_re_matches 32 | 33 | def assert_re_nomatch(regex, string): 34 | with pytest.raises(AssertionError): 35 | assert_re_matches(regex, string) 36 | 37 | assert_re_matches("test", "test") 38 | assert_re_matches("test", "testxxx") 39 | assert_re_nomatch("test", "tesd") 40 | 41 | assert_re_matches("test$", "test") 42 | assert_re_nomatch("test$", "testx") 43 | 44 | assert_re_matches(".*test", "test") 45 | assert_re_matches(".*test", "xxtest") 46 | assert_re_matches(".*test", "testxx") 47 | assert_re_nomatch(".*test", "tesd") 48 | 49 | assert_re_matches("(test)", "test") 50 | assert_re_matches("(test)", "testx") 51 | assert_re_nomatch("(test)", "tesd") 52 | 53 | assert_re_matches("test.*test", "testtest") 54 | assert_re_matches("test.*test", "testxxtest") 55 | assert_re_nomatch("test.*test", "test\ntest") 56 | 57 | assert_re_matches("(?s)test.*test", "test\ntest") 58 | -------------------------------------------------------------------------------- /bionic/deps/extras.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file defines the ``extras_require`` argument used in setup.py -- i.e., the 3 | set of available Bionic subpackages (like bionic[standard] or bionic[gcp]). 4 | It's in its own file because Bionic uses the information here when importing 5 | optional dependencies. 6 | """ 7 | 8 | from collections import OrderedDict 9 | 10 | 11 | def combine(*dep_lists): 12 | """Combines multiple lists into a single sorted list of distinct items.""" 13 | return list(sorted(set(dep for dep_list in dep_lists for dep in dep_list))) 14 | 15 | 16 | # Construct the mapping from "extra name" to package descriptor. 17 | # We use an OrderedDict because the optdep module will want to know which 18 | # extras were added first. 
19 | extras = OrderedDict() 20 | 21 | extras["image"] = ["Pillow"] 22 | # We don't support versions of matplotlib below 3.1 because the default backend has 23 | # problems on OS X; and we don't support 3.2.x because of this bug: 24 | # https://github.com/matplotlib/matplotlib/issues/15410 25 | extras["matplotlib"] = combine(["matplotlib>=3.1,!=3.2.*"], extras["image"]) 26 | extras["viz"] = combine(["hsluv", "networkx", "pydot"], extras["image"]) 27 | 28 | extras["standard"] = combine(extras["matplotlib"], extras["viz"]) 29 | 30 | extras["dill"] = ["dill"] 31 | extras["dask"] = ["dask[dataframe]"] 32 | extras["gcp"] = ["fsspec", "gcsfs"] 33 | extras["parallel"] = ["cloudpickle", "loky"] 34 | extras["geopandas"] = ["geopandas"] 35 | extras["aip"] = combine( 36 | [ 37 | "google-auth", 38 | "google-api-python-client", 39 | "google-cloud-logging", 40 | "cloudpickle", 41 | "docker", 42 | ], 43 | extras["gcp"], 44 | ) 45 | 46 | extras["examples"] = combine(extras["standard"], ["scikit-learn"]) 47 | extras["full"] = combine(*extras.values()) 48 | 49 | extras["dev"] = combine( 50 | [ 51 | "pytest", 52 | "pytest-shard", 53 | "black", 54 | "flake8", 55 | "flake8-print", 56 | "flake8-fixme", 57 | "importlib-metadata<5", # flake8 is incompatible with importlib 5.0.0 58 | "sphinx!=3.2.0", 59 | "sphinx_rtd_theme", 60 | "sphinx-autobuild", 61 | "nbsphinx", 62 | "jupyter", 63 | "bumpversion", 64 | "GitPython", 65 | ], 66 | *extras.values() 67 | ) 68 | 69 | # This will be imported by setup.py. 70 | extras_require = extras 71 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/lowercase_sum/18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365/abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7/metadata_d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 53f0c7cbf9464530929bbdb5991d003f15e53687898ab690af83ee8ea90c1533 3 | url: ../../../../artifacts/lowercase_sum/44556b2a-bd8f-44c6-a1bb-1a03ed2a839f/lowercase_sum.json 4 | descriptor: lowercase_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb 20 | functional_hash: 18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365 21 | nominal_hash: abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - lowercase_chars 26 | - 9ed0cd8e69 27 | code_fingerprint: 28 | bytecode_hash: b'\xd8!\\\xa0wX\x8d\x1c\x06\xb8AK\xaf\xde\xa8\xc2' 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: '0' 34 | minor: '0' 35 | dep_digests: 36 | - exact_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 37 | functional_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 38 | nominal_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 39 | provenance: null 40 | descriptor: 41 | exact_hash: d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb 42 | functional_hash: 
18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365 43 | nominal_hash: abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7 44 | descriptor: lowercase_sum 45 | exact_hash: d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb 46 | functional_hash: 18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365 47 | nominal_hash: abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7 48 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/lowercase_sum/df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae/928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f/metadata_68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: a3e8bae3649bc57d44b928616c9a641643e36bd26e13467f24dffe7aba3eaff2 3 | url: ../../../../artifacts/lowercase_sum/1c22c085-6fa5-4df7-a69a-a03a3e880e90/lowercase_sum.json 4 | descriptor: lowercase_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: 68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633 20 | functional_hash: df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae 21 | nominal_hash: 928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - lowercase_chars 26 | - 9ed0cd8e69 27 | code_fingerprint: 28 | bytecode_hash: b'\xd8!\\\xa0wX\x8d\x1c\x06\xb8AK\xaf\xde\xa8\xc2' 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: '0' 34 | minor: '0' 35 | dep_digests: 36 | - exact_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 37 | functional_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 38 | nominal_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 39 | provenance: null 40 | descriptor: 41 | exact_hash: 68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633 42 | functional_hash: df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae 43 | nominal_hash: 928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f 44 | descriptor: lowercase_sum 45 | exact_hash: 68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633 46 | functional_hash: df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae 47 | nominal_hash: 928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f 48 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/uppercase_sum/4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3/2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91/metadata_dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 03cd8df2dd4256cd04907f6b281b2116f9188b3f32cd99d7e6d98f81b9a0e675 3 | url: ../../../../artifacts/uppercase_sum/5998ef92-4102-4e9c-9ef5-f996da3a9fd9/uppercase_sum.json 4 | descriptor: uppercase_sum 5 | 
provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - uppercase_chars 9 | - e99019711a 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5 20 | functional_hash: 4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3 21 | nominal_hash: 2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - uppercase_chars 26 | - e99019711a 27 | code_fingerprint: 28 | bytecode_hash: b'\xc0j\x18\x8f\xeef\xff\x0e\x89%\xa8z#\x16\x8d\xc9' 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: '0' 34 | minor: '0' 35 | dep_digests: 36 | - exact_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 37 | functional_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 38 | nominal_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 39 | provenance: null 40 | descriptor: 41 | exact_hash: dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5 42 | functional_hash: 4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3 43 | nominal_hash: 2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91 44 | descriptor: uppercase_sum 45 | exact_hash: dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5 46 | functional_hash: 4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3 47 | nominal_hash: 2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91 48 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/uppercase_sum/77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8/25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542/metadata_154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: cca69c8993ccba521327129f3f6d91ff72b7b1fb625ce355541dc349be74668b 3 | url: ../../../../artifacts/uppercase_sum/fbdc03e4-c713-4a7f-aca6-79bd31bb9d62/uppercase_sum.json 4 | descriptor: uppercase_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - uppercase_chars 9 | - e99019711a 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: 154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035 20 | functional_hash: 77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8 21 | nominal_hash: 25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - uppercase_chars 26 | - e99019711a 27 | code_fingerprint: 28 | bytecode_hash: b'\xc0j\x18\x8f\xeef\xff\x0e\x89%\xa8z#\x16\x8d\xc9' 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: '0' 34 | minor: '0' 35 | dep_digests: 36 | - exact_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 37 | functional_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 38 | nominal_hash: 
95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 39 | provenance: null 40 | descriptor: 41 | exact_hash: 154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035 42 | functional_hash: 77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8 43 | nominal_hash: 25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542 44 | descriptor: uppercase_sum 45 | exact_hash: 154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035 46 | functional_hash: 77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8 47 | nominal_hash: 25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542 48 | -------------------------------------------------------------------------------- /bionic/utils/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with URLs. 3 | """ 4 | 5 | import os 6 | from pathlib import Path 7 | from urllib.parse import unquote, urlparse 8 | 9 | FILE_SCHEME = "file" 10 | GCS_SCHEME = "gs" 11 | SUPPORTED_SCHEMES = [FILE_SCHEME, GCS_SCHEME] 12 | 13 | 14 | def is_file_url(url): 15 | result = urlparse(url) 16 | return result.scheme == FILE_SCHEME 17 | 18 | 19 | def is_gcs_url(url): 20 | result = urlparse(url) 21 | return result.scheme == GCS_SCHEME 22 | 23 | 24 | def is_absolute_url(url): 25 | result = urlparse(url) 26 | if not result.scheme: 27 | return False 28 | if result.scheme not in SUPPORTED_SCHEMES: 29 | raise ValueError(f"Found a URL with unsupported scheme {result.scheme!r}.") 30 | return True 31 | 32 | 33 | def path_from_url(url): 34 | result = urlparse(url) 35 | return Path(unquote(result.path)) 36 | 37 | 38 | def url_from_path(path): 39 | return Path(path).as_uri() 40 | 41 | 42 | def bucket_and_object_names_from_gs_url(url): 43 | if not is_gcs_url(url): 44 | raise ValueError(f'url must have schema "{GCS_SCHEME}": got {url}') 45 | result = urlparse(url) 46 | result_path = result.path 47 | return result.netloc, result_path[1:] 48 | 49 | 50 | def relativize_url(absolute_url, base_url): 51 | """ 52 | Converts an absolute file URL to one relative to a base file URL. 53 | 54 | If either URL is not a file URL, this returns the original absolute URL. 55 | """ 56 | 57 | if not is_file_url(absolute_url) or not is_file_url(base_url): 58 | return absolute_url 59 | absolute_path = path_from_url(absolute_url) 60 | base_path = path_from_url(base_url) 61 | # Using str(absolute_path.relative_to(base_path.parent)) doesn't work as well here, 62 | # because it throws an exception if base_path is not a parent of absolute_path. 63 | return os.path.relpath(absolute_path, base_path.parent) 64 | 65 | 66 | def derelativize_url(relative_url, base_url): 67 | """ 68 | Given a URL relative to another base URL, returns an absolute URL. 69 | 70 | If the first URL is not relative, it is returned unchanged. 71 | """ 72 | 73 | if is_absolute_url(relative_url): 74 | return relative_url 75 | base_path = path_from_url(base_url) 76 | relative_path = path_from_url(relative_url) 77 | absolute_path = os.path.normpath(base_path.parent.joinpath(relative_path)) 78 | return url_from_path(absolute_path) 79 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | Bionic 3 | ====== 4 | 5 | .. note:: 6 | Bionic is in alpha and evolving rapidly. We recommend it for research 7 | projects where the dataset fits in memory. 
We do not recommend it for 8 | pipelines running in production. 9 | 10 | **Release:** v\ |version| --- 11 | **Quick Links:** `Source `_ | `Issues `_ | `Installation `_ | `Example `_ 12 | 13 | Bionic is a framework for analyzing and modeling data in Python. It's designed 14 | to help you **iterate faster on your research**, and help your colleagues 15 | **reuse your code more easily**. 16 | 17 | You define the *entities* you care about -- dataframes, parameters, models, 18 | plots -- using individual Python functions. Then Bionic assembles your 19 | definitions into a *flow*: a custom Python object that can efficiently compute 20 | any of your entities, and can be modified on the fly to test out new 21 | variations. 22 | 23 | This approach has several benefits: 24 | 25 | * Bionic automatically glues your functions into a coherent program, so your 26 | **code stays modular** but behaves like an **integrated end-to-end tool**. 27 | * You can compute any entity with one function call, so it's **easy to iterate 28 | and debug**. 29 | * Everything you compute is automatically cached, so you spend **less time 30 | waiting** and **zero time managing data files**. 31 | * Flows are easy to use from a notebook, so you can **work interactively** but 32 | keep your code in a **version-controlled** Python file. 33 | * Any part of a flow can be modified dynamically, so you can **quickly try 34 | experiments**, and your colleagues can **reuse your code** without rewriting 35 | it. 36 | 37 | .. 38 | This is super annoying, but it's the only way I've found to make a bold 39 | internal link in RST. (I really want the link to be bold so you can see 40 | the example link easily when you're scanning.) 41 | 42 | Check out an |bold link|! 43 | 44 | .. |bold link| raw:: html 45 | 46 | 47 | example here 48 | 49 | Documentation Contents 50 | ---------------------- 51 | 52 | .. toctree:: 53 | :maxdepth: 2 54 | 55 | what 56 | get-started 57 | concepts 58 | warnings 59 | api/index.rst 60 | get-help 61 | contributing 62 | future 63 | release-notes 64 | -------------------------------------------------------------------------------- /docs/api/decorators.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Decorators 3 | ========== 4 | 5 | Introduction 6 | ------------ 7 | 8 | Bionic decorators are Python decorators designed to be used in conjunction with 9 | a ``FlowBuilder``. They modify the way functions are incorporated into flows. 10 | 11 | The normal way (without decorators) of incorporating functions into flows is 12 | as follows: 13 | 14 | .. code-block:: python 15 | 16 | import bionic as bn 17 | 18 | builder = FlowBuilder('my_flow') 19 | 20 | builder.assign('x', 1) 21 | 22 | @builder 23 | def x_plus_one(x): 24 | return x + 1 25 | 26 | print(builder.build().get('x_plus_one')) # Prints "2". 27 | 28 | In the simple case above, the function is interpreted as a new entity named 29 | ``x_plus_one`` which depends on the existing entity ``x``. However, in many 30 | cases we want Bionic to process the function in a more complex way. In these 31 | cases we can add additional decorators: 32 | 33 | .. code-block:: python 34 | 35 | import bionic as bn 36 | 37 | builder = FlowBuilder('my_flow') 38 | 39 | builder.assign('x', 1) 40 | 41 | @builder 42 | @bn.outputs('x_plus_one', 'x_plus_two') 43 | @bn.persist(False) 44 | def some_function(x): 45 | return (x + 1), (x + 2) 46 | 47 | print(builder.build().get('x_plus_one')) # Prints "2". 
48 | print(builder.build().get('x_plus_two')) # Prints "3". 49 | 50 | These decorators tell Bionic that our function actually generates two values 51 | for two different entities (``x_plus_one`` and ``x_plus_two``), and these 52 | values should not be persisted to disk. 53 | 54 | All Bionic decorators should be placed *after* the initial ``@builder`` 55 | decorator, but *before* any regular (non-Bionic) decorators. Finally, the 56 | ``@builder`` decorator returns the original function, so it can be called 57 | normally, as if it had been defined without any of the Bionic decorators. 58 | E.g.: 59 | 60 | .. code-block:: python 61 | 62 | @builder 63 | @bn.persist(False) 64 | def f(x): 65 | return x + 1 66 | 67 | assert f(7) == 8 68 | 69 | Built-In Decorators 70 | ------------------- 71 | 72 | .. autofunction:: bionic.run_in_aip 73 | .. autofunction:: bionic.changes_per_run 74 | .. autofunction:: bionic.docs 75 | .. autofunction:: bionic.gather 76 | .. autofunction:: bionic.immediate 77 | .. autofunction:: bionic.memoize 78 | .. autofunction:: bionic.output 79 | .. autofunction:: bionic.outputs 80 | .. autofunction:: bionic.persist 81 | .. autofunction:: bionic.pyplot 82 | .. autofunction:: bionic.version 83 | 84 | -------------------------------------------------------------------------------- /tests/test_flow/test_multi_out.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import bionic as bn 4 | 5 | 6 | def test_no_doc(builder): 7 | @builder 8 | @bn.outputs("a", "b") 9 | def f(): 10 | return 1, 2 11 | 12 | flow = builder.build() 13 | assert flow.entity_doc("a") is None 14 | assert flow.entity_doc("b") is None 15 | 16 | 17 | def test_multi_docs(builder): 18 | @builder 19 | @bn.outputs("a", "b") 20 | @bn.docs("a doc", "b doc") 21 | def f(): 22 | return 1, 2 23 | 24 | flow = builder.build() 25 | assert flow.entity_doc("a") == "a doc" 26 | assert flow.entity_doc("b") == "b doc" 27 | 28 | 29 | def test_multi_docs_decorated_first(builder): 30 | @builder 31 | @bn.docs("a doc", "b doc") 32 | @bn.outputs("a", "b") 33 | def f(): 34 | return 1, 2 35 | 36 | flow = builder.build() 37 | assert flow.entity_doc("a") == "a doc" 38 | assert flow.entity_doc("b") == "b doc" 39 | 40 | 41 | def test_too_many_docs(builder): 42 | with pytest.raises(ValueError): 43 | 44 | @builder 45 | @bn.docs("a doc", "b doc") 46 | def f(): 47 | return 1, 2 48 | 49 | 50 | def test_too_few_docs(builder): 51 | with pytest.warns(Warning): 52 | 53 | @builder 54 | @bn.outputs("a", "b") 55 | def f(): 56 | "a and b doc" 57 | return 1, 2 58 | 59 | flow = builder.build() 60 | assert flow.entity_doc("a") == "a and b doc" 61 | assert flow.entity_doc("b") == "a and b doc" 62 | 63 | 64 | def test_multi_default_protocols(builder): 65 | @builder 66 | @bn.outputs("a", "b") 67 | def f(): 68 | return 1, 2 69 | 70 | flow = builder.build() 71 | assert flow.entity_protocol("a") == bn.flow.DEFAULT_PROTOCOL 72 | assert flow.entity_protocol("b") == bn.flow.DEFAULT_PROTOCOL 73 | 74 | 75 | def test_multi_custom_protocols(builder): 76 | protocol = bn.protocol.dillable() 77 | 78 | @builder 79 | @bn.outputs("a", "b") 80 | @protocol 81 | def f(): 82 | return 1, 2 83 | 84 | flow = builder.build() 85 | assert flow.entity_protocol("a") == protocol 86 | assert flow.entity_protocol("b") == protocol 87 | 88 | 89 | def test_multi_custom_protocols_decorated_first(builder): 90 | protocol = bn.protocol.dillable() 91 | 92 | @builder 93 | @protocol 94 | @bn.outputs("a", "b") 95 | def f(): 96 | return 1, 2 97 | 
98 | flow = builder.build() 99 | assert flow.entity_protocol("a") == protocol 100 | assert flow.entity_protocol("b") == protocol 101 | -------------------------------------------------------------------------------- /bionic/tokenization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains a tokenize() function which can be used to convert arbitrary values 3 | into nice strings, suitable for use as filenames. 4 | """ 5 | 6 | from .utils.misc import hash_to_hex 7 | 8 | 9 | def char_range(first, last): 10 | "Return a list of all the characters from first to last, inclusive." 11 | return [chr(i) for i in range(ord(first), ord(last) + 1)] 12 | 13 | 14 | CLEAN_CHARS = set( 15 | char_range("a", "z") + char_range("A", "Z") + char_range("0", "9") + ["_", "-", "."] 16 | ) 17 | MAX_CLEAN_STR_LEN = 32 18 | 19 | 20 | def clean_str(string): 21 | "Converts an arbitary string to one that could be used as a filename." 22 | cleaned = "".join((c if c in CLEAN_CHARS else ".") for c in string) 23 | # Some filesystems are case insensitive, so we don't want uppercase 24 | # letters. 25 | cleaned = cleaned.lower() 26 | # Some filesystems treat files differently if they start with a period, so 27 | # let's avoid that. 28 | if cleaned.startswith("."): 29 | cleaned = "_" + cleaned 30 | if len(cleaned) > MAX_CLEAN_STR_LEN: 31 | head_len = (MAX_CLEAN_STR_LEN // 2) - 1 32 | tail_len = MAX_CLEAN_STR_LEN - (head_len + 3) 33 | cleaned = cleaned[:head_len] + "..." + cleaned[-tail_len:] 34 | return cleaned 35 | 36 | 37 | # When hashing values for tokens, we'll hash down to 5 bytes (10 hex chars). 38 | # The reasoning is: 39 | # - we want to support up to 1e6 distinct values 40 | # - to avoid collisions, we need a hash space of 1e6 squared, or 1e12 41 | # - that's 36 bits 42 | # - rounding up, that's 5 bytes 43 | # I picked 1e6 arbitrarily; the hash is only used when two values have the same 44 | # "clean string" value OR when they can't be converted to strings at all, but 45 | # that will include things like dicts of hyperparameter values. 46 | HASH_LEN = 5 47 | 48 | 49 | # TODO: add optional directory parameter for where to write/read from 50 | def tokenize(value, serialize_func=None): 51 | """ 52 | Convert an arbitrary value to a nice, unique string that could be used as a 53 | filename. If a serialization function is provided, the value will be 54 | serialized and hashed. Otherwise it will be converted to a string; if that 55 | string is not suitable for a filename, it will be cleaned and a hash will 56 | be appended. 57 | """ 58 | 59 | if serialize_func is not None: 60 | bytestring = serialize_func(value) 61 | token = hash_to_hex(bytestring, HASH_LEN) 62 | else: 63 | value_str = str(value) 64 | token = clean_str(value_str) 65 | if token != value_str: 66 | token += "_" + hash_to_hex(value_str.encode("utf-8"), HASH_LEN) 67 | 68 | return token 69 | -------------------------------------------------------------------------------- /bionic/gcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with Google Cloud Storage. 
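The two helpers defined here are ``get_gcs_fs_without_warnings``, which returns a
(cached) fsspec GCS filesystem, and ``upload_to_gcs``, which copies a local path to
a GCS URL. A rough usage sketch (``gs://my-bucket/results.json`` is a hypothetical
URL):

    fs = get_gcs_fs_without_warnings()
    with fs.open("gs://my-bucket/results.json", "rb") as f:
        data = f.read()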
3 | """ 4 | 5 | import logging 6 | import warnings 7 | 8 | from .deps.optdep import import_optional_dependency 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | _cached_gcs_fs = None 14 | 15 | 16 | def get_gcs_fs_without_warnings(cache_value=True): 17 | # TODO It's not expensive to create the gcs filesystem, but caching this enables 18 | # us to mock the cached gcs_fs with a mock implementation in tests. We should 19 | # change the tests to inject the filesystem in a different way and get rid of 20 | # this caching. 21 | if cache_value: 22 | global _cached_gcs_fs 23 | if _cached_gcs_fs is None: 24 | _cached_gcs_fs = get_gcs_fs_without_warnings(cache_value=False) 25 | return _cached_gcs_fs 26 | 27 | fsspec = import_optional_dependency("fsspec", purpose="caching to GCS") 28 | 29 | with warnings.catch_warnings(): 30 | # Google's SDK warns if you use end user credentials instead of a 31 | # service account. I think this warning is intended for production 32 | # server code, where you don't want GCP access to be tied to a 33 | # particular user. However, this code is intended to be run by 34 | # individuals, so using end user credentials seems appropriate. 35 | # Hence, we'll suppress this warning. 36 | warnings.filterwarnings( 37 | "ignore", "Your application has authenticated using end user credentials" 38 | ) 39 | logger.info("Initializing GCS filesystem ...") 40 | return fsspec.filesystem("gcs") 41 | 42 | 43 | # TODO: Consider using persistence.GcsFilesystem instead of exposing this function. 44 | def upload_to_gcs(path, url): 45 | """ 46 | Copy a local path to GCS URL. 47 | """ 48 | gcs_fs = get_gcs_fs_without_warnings() 49 | if path.is_dir(): 50 | gcs_fs.put(str(path), url, recursive=True) 51 | else: 52 | # If the GCS URL is a folder, we want to write the file in the folder. 53 | # There seems to be a bug in fsspec due to which, the file is uploaded 54 | # as the url, instead of inside the folder. What this means is, writing 55 | # a file c.json to gs://a/b/ would result in file gs://a/b instead of 56 | # gs://a/b/c.json. 57 | # 58 | # The `put` API is supposed to write the file inside the folder but it 59 | # strips the ending "/" at the end in fsspec's `_strip_protocol` method. 60 | # See https://github.com/intake/filesystem_spec/issues/448 for more 61 | # details and tracking this issue. 
62 | if url.endswith("/"): 63 | url = url + path.name 64 | gcs_fs.put_file(str(path), url) 65 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/total_sum/401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0/f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b/metadata_3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 927b8935346c55e24d2cf6de1a771cf018fb38df4bd182acad078557b88a773d 3 | url: ../../../../artifacts/total_sum/faed6d2b-5b8c-449a-9fd6-946bec4f5b0d/total_sum.json 4 | descriptor: total_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | - !!python/tuple 11 | - uppercase_chars 12 | - e99019711a 13 | code_fingerprint: 14 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 15 | is_identity: true 16 | orig_flow_name: null 17 | version: 18 | includes_bytecode: true 19 | major: '0' 20 | minor: '0' 21 | dep_digests: 22 | - exact_hash: 3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34 23 | functional_hash: 401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0 24 | nominal_hash: f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b 25 | provenance: 26 | case_key_elements: 27 | - !!python/tuple 28 | - lowercase_chars 29 | - 9ed0cd8e69 30 | - !!python/tuple 31 | - uppercase_chars 32 | - e99019711a 33 | code_fingerprint: 34 | bytecode_hash: b'\xf9\x8a\xd9\xda\xf2\xc3Hb<\xa5\xde\xc2g\x04xA' 35 | is_identity: false 36 | orig_flow_name: null 37 | version: 38 | includes_bytecode: true 39 | major: '0' 40 | minor: '0' 41 | dep_digests: 42 | - exact_hash: 53f0c7cbf9464530929bbdb5991d003f15e53687898ab690af83ee8ea90c1533 43 | functional_hash: 53f0c7cbf9464530929bbdb5991d003f15e53687898ab690af83ee8ea90c1533 44 | nominal_hash: 53f0c7cbf9464530929bbdb5991d003f15e53687898ab690af83ee8ea90c1533 45 | provenance: null 46 | - exact_hash: cca69c8993ccba521327129f3f6d91ff72b7b1fb625ce355541dc349be74668b 47 | functional_hash: cca69c8993ccba521327129f3f6d91ff72b7b1fb625ce355541dc349be74668b 48 | nominal_hash: cca69c8993ccba521327129f3f6d91ff72b7b1fb625ce355541dc349be74668b 49 | provenance: null 50 | descriptor: 51 | exact_hash: 3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34 52 | functional_hash: 401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0 53 | nominal_hash: f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b 54 | descriptor: total_sum 55 | exact_hash: 3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34 56 | functional_hash: 401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0 57 | nominal_hash: f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b 58 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/total_sum/e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98/e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c/metadata_354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 6c7cfc2deb288fa30f5036fa442d8082d755c0bbcbb8f93f3c49c957142114fb 3 | url: ../../../../artifacts/total_sum/2f000e88-5a8f-4762-b7c4-77eb444348f6/total_sum.json 4 | 
descriptor: total_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | - !!python/tuple 11 | - uppercase_chars 12 | - e99019711a 13 | code_fingerprint: 14 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 15 | is_identity: true 16 | orig_flow_name: null 17 | version: 18 | includes_bytecode: true 19 | major: '0' 20 | minor: '0' 21 | dep_digests: 22 | - exact_hash: 354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d 23 | functional_hash: e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98 24 | nominal_hash: e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c 25 | provenance: 26 | case_key_elements: 27 | - !!python/tuple 28 | - lowercase_chars 29 | - 9ed0cd8e69 30 | - !!python/tuple 31 | - uppercase_chars 32 | - e99019711a 33 | code_fingerprint: 34 | bytecode_hash: b'\xf9\x8a\xd9\xda\xf2\xc3Hb<\xa5\xde\xc2g\x04xA' 35 | is_identity: false 36 | orig_flow_name: null 37 | version: 38 | includes_bytecode: true 39 | major: '0' 40 | minor: '0' 41 | dep_digests: 42 | - exact_hash: a3e8bae3649bc57d44b928616c9a641643e36bd26e13467f24dffe7aba3eaff2 43 | functional_hash: a3e8bae3649bc57d44b928616c9a641643e36bd26e13467f24dffe7aba3eaff2 44 | nominal_hash: a3e8bae3649bc57d44b928616c9a641643e36bd26e13467f24dffe7aba3eaff2 45 | provenance: null 46 | - exact_hash: 03cd8df2dd4256cd04907f6b281b2116f9188b3f32cd99d7e6d98f81b9a0e675 47 | functional_hash: 03cd8df2dd4256cd04907f6b281b2116f9188b3f32cd99d7e6d98f81b9a0e675 48 | nominal_hash: 03cd8df2dd4256cd04907f6b281b2116f9188b3f32cd99d7e6d98f81b9a0e675 49 | provenance: null 50 | descriptor: 51 | exact_hash: 354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d 52 | functional_hash: e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98 53 | nominal_hash: e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c 54 | descriptor: total_sum 55 | exact_hash: 354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d 56 | functional_hash: e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98 57 | nominal_hash: e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c 58 | -------------------------------------------------------------------------------- /.github/workflows/bionic-test.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | workflow_dispatch: 7 | 8 | jobs: 9 | build: 10 | 11 | # TODO Consider running on macos-latest as well. 12 | runs-on: ubuntu-latest 13 | # Just in case we start having to pay for our CI compute costs, it's probably wise 14 | # to have a time limit. 15 | timeout-minutes: 60 16 | strategy: 17 | # Keep running all test configurations, even if one of them fails. This is helpful 18 | # because if one configuration fails, it's useful to see whether the other ones 19 | # fail too. (This helps diagnose tests that are flaky or specific to one Python 20 | # version.) 
21 | fail-fast: false 22 | matrix: 23 | python-version: [3.7, 3.8] 24 | include: 25 | - python-version: 3.7 26 | shard-id: 1 27 | - python-version: 3.8 28 | shard-id: 2 29 | 30 | steps: 31 | - uses: actions/checkout@v2 32 | - name: Set up Python ${{ matrix.python-version }} 33 | uses: actions/setup-python@v2 34 | with: 35 | python-version: ${{ matrix.python-version }} 36 | - name: Install dependencies 37 | run: | 38 | python -m pip install --upgrade pip 39 | sudo apt-get install graphviz 40 | pip install --upgrade --upgrade-strategy eager '.[dev]' 41 | # This prints out all installed package versions, which may help for debugging 42 | # build failures. 43 | pip freeze 44 | - name: Set up gcloud 45 | uses: google-github-actions/setup-gcloud@v0.2.1 46 | with: 47 | service_account_key: ${{ secrets.GCP_SA_KEY }} 48 | export_default_credentials: true 49 | - name: Set up GCS bucket argument 50 | # If we have access to a GCS bucket, we want to run our tests with it. 51 | # But if we were triggered from a pull request (as opposed to a push) then 52 | # we won't have access to any secrets, in which case we need to omit the 53 | # `--bucket` argument. 54 | # Unfortunately this seems to be the simplest way to make this work. See 55 | # https://github.community/t/how-can-i-test-if-secrets-are-available-in-an-action/17911 56 | # for more details. 57 | run: | 58 | ([ -z ${{ secrets.GCP_BUCKET }} ] || echo "BUCKET_ARG=--bucket=${{ secrets.GCP_BUCKET }}" >> $GITHUB_ENV) 59 | - name: Lint with flake8 and black 60 | run: | 61 | flake8 62 | black --check . 63 | - name: Run baseline tests 64 | # Running GCS tests in CI costs less than a dollar per day on average. 65 | run: | 66 | pytest $BUCKET_ARG 67 | - name: Run extra tests (sharded) 68 | # Running each test on each Python version is expensive, so we compromise: we run 69 | # the baseline tests above on each version, since they're fast and hopefully 70 | # comprehensive enough to shake out any version-specific bugs; and we run each of 71 | # the other tests on just one Python version, reducing the total build time. 72 | run: | 73 | pytest $BUCKET_ARG --parallel --slow -m 'not baseline' --num-shards 3 --shard-id ${{matrix.shard-id}} 74 | -------------------------------------------------------------------------------- /bionic/decoration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for creating and applying Bionic decorators. 3 | 4 | Bionic decorators are expected to be applied like this: 5 | 6 | @builder 7 | @decorator1 8 | @decorator2 9 | def func(arg1, arg2, ...): 10 | ... 11 | 12 | Each decorator attaches information to the decorated function by by creating or updating 13 | a DecorationAccumulator object, set as an attribute on the function. The assumption is 14 | that the decorator at the top, (``@builder``) will be a FlowBuilder object which removes 15 | this accumulator object and uses it to define a new entity. 
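As a rough illustration (not an actual Bionic decorator), a new decorator could be
built with the ``decorator_updating_accumulator`` helper defined below. For example,
a hypothetical decorator that disables persistence might look like:

    never_persist = decorator_updating_accumulator(
        lambda acc: acc.update_attr("should_persist", False, "@never_persist")
    )

Applying ``@never_persist`` leaves the decorated function unchanged and only records
the setting on the attached DecorationAccumulator.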
16 | """ 17 | 18 | import attr 19 | import warnings 20 | 21 | from .exception import AttributeValidationError 22 | from .provider import FunctionProvider 23 | from .utils.misc import oneline 24 | 25 | 26 | @attr.s 27 | class DecorationAccumulator: 28 | provider = attr.ib() 29 | 30 | protocol = attr.ib(default=None) 31 | docs = attr.ib(default=None) 32 | should_persist = attr.ib(default=None) 33 | should_memoize = attr.ib(default=None) 34 | 35 | def wrap_provider(self, wrapper_fn, *args, **kwargs): 36 | self.provider = wrapper_fn(self.provider, *args, **kwargs) 37 | 38 | def update_attr( 39 | self, attr_name, attr_value, decorator_name, raise_if_already_set=True 40 | ): 41 | old_attr_value = getattr(self, attr_name) 42 | if old_attr_value is not None: 43 | message = f""" 44 | Tried to use {decorator_name} with value {attr_value!r}, 45 | but this decorator was already used with value {old_attr_value!r} 46 | """ 47 | if raise_if_already_set: 48 | raise AttributeValidationError(oneline(message)) 49 | else: 50 | preamble = """ 51 | Applying this type of decorator multiple times is deprecated and will 52 | become an error condition in a future release; please remove all but 53 | the uppermost uses of this decorator. Details: 54 | """ 55 | warnings.warn(oneline(preamble) + "\n" + oneline(message)) 56 | setattr(self, attr_name, attr_value) 57 | 58 | 59 | def decorator_updating_accumulator(acc_update_func): 60 | """ 61 | Creates a decorator which applies a transformation to the DecorationAccumulator 62 | attached to the decorated function. (If no accumulator is attached, the decorator 63 | will initialize one.) 64 | """ 65 | 66 | def decorator(func): 67 | init_accumulator_if_not_set_on_func(func) 68 | acc = get_accumulator_from_func(func) 69 | acc_update_func(acc) 70 | return func 71 | 72 | return decorator 73 | 74 | 75 | ACC_ATTR_NAME = "bionic_decorator_accumulator" 76 | 77 | 78 | def init_accumulator_if_not_set_on_func(func): 79 | if not hasattr(func, ACC_ATTR_NAME): 80 | setattr( 81 | func, 82 | ACC_ATTR_NAME, 83 | DecorationAccumulator(provider=FunctionProvider(func)), 84 | ) 85 | 86 | 87 | def get_accumulator_from_func(func): 88 | return getattr(func, ACC_ATTR_NAME) 89 | 90 | 91 | def pop_accumulator_from_func(func): 92 | acc = get_accumulator_from_func(func) 93 | delattr(func, ACC_ATTR_NAME) 94 | return acc 95 | -------------------------------------------------------------------------------- /tests/test_flow/generate_test_compatibility_cache.py: -------------------------------------------------------------------------------- 1 | # This script generates cache for a flow represented in Harness class 2 | # inside test_dir (tests/test_flow/test_persistence_compatibility). 3 | # The generated cache is used by test_persistence_compatibility.py tests 4 | # to validate that the cache can be deserialized by current Bionic. 5 | # In case the caching has changed, this file is used to replace the 6 | # test cache. 7 | # 8 | # To renegerate cache, run the following command from bionic/ dir 9 | # `python -m tests.test_flow.generate_test_compatibility_cache` 10 | # 11 | # Note that the repo ignores *.pkl datafiles which is bypassed using 12 | # "Test data" section in .gitignore. 
13 | 14 | import os 15 | import shutil 16 | 17 | import bionic as bn 18 | 19 | from ..helpers import ResettingCallCounter 20 | 21 | 22 | CACHE_TEST_DIR = os.path.join( 23 | os.path.dirname(__file__), "test_persistence_compatibility" 24 | ) 25 | 26 | 27 | class Harness: 28 | """ 29 | Holds a simple Bionic flow with counters to all the functions in it. 30 | """ 31 | 32 | EXPECTED_TOTAL_SUM = 1002 33 | 34 | def __init__(self, cache_dir, make_counter): 35 | lowercase_sum_counter = make_counter() 36 | uppercase_sum_counter = make_counter() 37 | total_sum_counter = make_counter() 38 | 39 | builder = bn.FlowBuilder("test") 40 | 41 | builder.set("core__persistent_cache__flow_dir", cache_dir) 42 | 43 | # It's important that this test uses sets, because we want to check that sets 44 | # are hashed deterministically. (Set iteration is non-deterministic, but it's 45 | # always the same within one Python process, so a simpler test where we just 46 | # run a flow multiple times won't work for this.) 47 | builder.assign("lowercase_chars", set("abcdef")) 48 | builder.assign("uppercase_chars", frozenset("ABCDEF")) 49 | 50 | @builder 51 | @bn.version_no_warnings 52 | def lowercase_sum(lowercase_chars): 53 | lowercase_sum_counter.mark() 54 | return sum(ord(char) for char in lowercase_chars) 55 | 56 | @builder 57 | @bn.version_no_warnings 58 | def uppercase_sum(uppercase_chars): 59 | uppercase_sum_counter.mark() 60 | return sum(ord(char) for char in uppercase_chars) 61 | 62 | @builder 63 | @bn.version_no_warnings 64 | def total_sum(lowercase_sum, uppercase_sum): 65 | total_sum_counter.mark() 66 | return lowercase_sum + uppercase_sum 67 | 68 | self.lowercase_sum_counter = lowercase_sum_counter 69 | self.uppercase_sum_counter = uppercase_sum_counter 70 | self.total_sum_counter = total_sum_counter 71 | 72 | self.manual_flow = builder.build() 73 | builder.set("core__versioning_mode", "auto") 74 | self.auto_flow = builder.build() 75 | 76 | @property 77 | def flows(self): 78 | return [self.manual_flow, self.auto_flow] 79 | 80 | 81 | if __name__ == "__main__": 82 | 83 | def make_counter(): 84 | return ResettingCallCounter() 85 | 86 | harness = Harness(CACHE_TEST_DIR, make_counter) 87 | 88 | shutil.rmtree(CACHE_TEST_DIR) 89 | 90 | for flow in harness.flows: 91 | # Make sure everything is written to the cache. 92 | flow.get("total_sum") 93 | -------------------------------------------------------------------------------- /tests/test_flow/test_new_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | These tests are for experimental descriptor-based uses of Bionic's API. 
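For example (mirroring the cases exercised below), ``@bn.returns`` maps a tuple
return value onto multiple entities, and ``@bn.accepts`` renames function arguments
to existing entities:

    @builder
    @bn.returns("three, four")
    def _():
        return 3, 4

    @builder
    @bn.accepts(my_x="x")
    def x_plus_one(my_x):
        return my_x + 1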
3 | """ 4 | 5 | import pytest 6 | 7 | import bionic as bn 8 | from bionic.exception import EntityValueError 9 | 10 | 11 | def test_returns(builder): 12 | @builder 13 | @bn.returns("one") 14 | def _(): 15 | return 1 16 | 17 | @builder 18 | @bn.returns("two,") 19 | def _(): 20 | return (2,) 21 | 22 | @builder 23 | @bn.returns("three, four") 24 | def _(): 25 | return 3, 4 26 | 27 | @builder 28 | @bn.returns("five, (six, seven)") 29 | def _(): 30 | return 5, (6, 7) 31 | 32 | flow = builder.build() 33 | 34 | assert flow.get("one") == 1 35 | assert flow.get("two") == 2 36 | assert flow.get("three") == 3 37 | assert flow.get("four") == 4 38 | assert flow.get("five") == 5 39 | assert flow.get("six") == 6 40 | assert flow.get("seven") == 7 41 | 42 | 43 | def test_failing_returns(builder): 44 | @builder 45 | @bn.returns("a, b") 46 | def wrong_number_of_values(): 47 | return 1, 2, 3 48 | 49 | @builder 50 | @bn.returns("c, d") 51 | def not_a_sequence(): 52 | return 1 53 | 54 | @builder 55 | @bn.returns("(e, f), g") 56 | def wrong_tuple_structure(): 57 | return 1, (2, 3) 58 | 59 | flow = builder.build() 60 | 61 | with pytest.raises(EntityValueError): 62 | flow.get("a") 63 | 64 | with pytest.raises(EntityValueError): 65 | flow.get("c") 66 | 67 | with pytest.raises(EntityValueError): 68 | flow.get("e") 69 | 70 | 71 | def test_accepts(builder): 72 | builder.assign("x", 2) 73 | builder.assign("y", 3) 74 | builder.assign("z", 4) 75 | 76 | @builder 77 | @bn.accepts(my_x="x") 78 | def x_plus_one(my_x): 79 | return my_x + 1 80 | 81 | @builder 82 | @bn.accepts(x_="x,") 83 | def x_plus_two(x_): 84 | (x,) = x_ 85 | return x + 2 86 | 87 | @builder 88 | @bn.accepts(my_y="y", my_other_y="y") 89 | def x_plus_two_y(x, my_y, my_other_y): 90 | return x + my_y + my_other_y 91 | 92 | @builder 93 | @bn.accepts(x_y="x, y") 94 | def x_plus_y(x_y): 95 | x, y = x_y 96 | return x + y 97 | 98 | @builder 99 | @bn.accepts(my_x="x", my_y="y") 100 | def xy(my_x, my_y): 101 | return my_x * my_y 102 | 103 | @builder 104 | @bn.accepts(x_y_z="x, (y, z)") 105 | def x_plus_y_plus_z(x_y_z): 106 | x, (y, z) = x_y_z 107 | return x + y + z 108 | 109 | flow = builder.build() 110 | 111 | assert flow.get("x_plus_one") == 3 112 | assert flow.get("x_plus_two") == 4 113 | assert flow.get("x_plus_y") == 5 114 | assert flow.get("x_plus_two_y") == 8 115 | assert flow.get("xy") == 6 116 | assert flow.get("x_plus_y_plus_z") == 9 117 | 118 | 119 | @pytest.mark.skip("Not implemented yet") 120 | def test_get(builder): 121 | builder.assign("x", 2) 122 | builder.assign("y", 3) 123 | builder.assign("z", 4) 124 | 125 | flow = builder.build() 126 | 127 | assert flow.get("()") == () 128 | assert flow.get("x,") == (2,) 129 | assert flow.get("x, x") == (2, 2) 130 | assert flow.get("x, y") == (2, 3) 131 | assert flow.get("x, (y, z)") == (2, (3, 4)) 132 | -------------------------------------------------------------------------------- /bionic/aip/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module is run as main in order to execute a task on a worker. 3 | """ 4 | 5 | import logging 6 | import pickle 7 | import os 8 | import sys 9 | 10 | from bionic.deps.optdep import import_optional_dependency 11 | from bionic.gcs import get_gcs_fs_without_warnings 12 | 13 | 14 | def _run(ipath, gcs_fs): 15 | cloudpickle = import_optional_dependency("cloudpickle") 16 | 17 | with gcs_fs.open(ipath, "rb") as f: 18 | task = cloudpickle.load(f) 19 | 20 | # Now that we have the task, set up logging. 
21 | _set_up_logging(task.job_id, task.config.project_id) 22 | logging.info(f"Read task from {ipath}") 23 | 24 | result = task.function() 25 | 26 | opath = task.output_uri 27 | logging.info(f"Uploading result to {opath}") 28 | with gcs_fs.open(opath, "wb") as f: 29 | pickle.dump(result, f) 30 | 31 | 32 | # Main entry point for AIP 33 | def run(): 34 | """ 35 | This method is a proxy to _run which does the actual work. The proxy exists 36 | so that _run can be replaced for testing. 37 | """ 38 | _run(sys.argv[-1], get_gcs_fs_without_warnings()) 39 | 40 | 41 | def _set_up_logging(job_id, project_id): 42 | if os.environ.get("BIONIC_NO_STACKDRIVER", False): 43 | return 44 | 45 | # TODO This is the ID of the hyperparameter tuning trial currently 46 | # running on this VM. This field is only set if the current 47 | # training job is a hyperparameter tuning job. Conductor uses this 48 | # environment variable but AIP documentation suggests us to use 49 | # TF_CONFIG. Check whether we need to update this env variable. 50 | # Find more details on TF_CONFIG at this link: 51 | # https://cloud.google.com/ai-platform/training/docs/distributed-training-details 52 | trial_id = os.environ.get("CLOUD_ML_TRIAL_ID", None) 53 | 54 | glogging = import_optional_dependency("google.cloud.logging") 55 | 56 | client = glogging.Client(project=project_id) 57 | resource = glogging.resource.Resource( 58 | type="ml_job", 59 | # AIP expects a default task_name for the master cluster. We 60 | # use a placeholder value till we start using clusters. Once we 61 | # do, it should be configured based on the cluster. 62 | labels=dict(job_id=job_id, project_id=project_id, task_name="master-replica-0"), 63 | ) 64 | labels = None 65 | if trial_id is not None: 66 | # Enable grouping by trial when present. 67 | labels = {"ml.googleapis.com/trial_id": trial_id} 68 | 69 | # Enable only the cloud logger to avoid duplicate messages. 70 | handler = glogging.handlers.handlers.CloudLoggingHandler( 71 | client, resource=resource, labels=labels 72 | ) 73 | root_logger = logging.getLogger() 74 | # Remote the StreamHandler. Any logs logged by it shows up as error 75 | # logs in Stackdriver. 76 | root_logger.handlers = [] 77 | # We should ideally make this configurable, but till then, let's 78 | # set the level to DEBUG to write all the logs. It's not hard to 79 | # filter using log level on Stackdriver so it doesn't create too 80 | # much noise anyway. 81 | root_logger.setLevel(logging.DEBUG) 82 | root_logger.addHandler(handler) 83 | for logger_name in glogging.handlers.handlers.EXCLUDED_LOGGER_DEFAULTS: 84 | logging.getLogger(logger_name).propagate = False 85 | 86 | 87 | if __name__ == "__main__": 88 | run() 89 | -------------------------------------------------------------------------------- /bionic/protocol.py: -------------------------------------------------------------------------------- 1 | from . import protocols 2 | from .utils.misc import oneline 3 | 4 | # These are callable with or without arguments. See BaseProtocol.__call__ for 5 | # why we instantiate them here. 
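# For example (illustrative only, not part of the original module): because these
# instances are callable with or without arguments, an entity function is expected
# to work when decorated either as `@bn.protocol.picklable` or as
# `@bn.protocol.picklable()`; both forms should behave the same way.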
6 | picklable = protocols.PicklableProtocol() # noqa: F401 7 | dillable = protocols.DillableProtocol() # noqa: F401 8 | dask = protocols.DaskProtocol() # noqa: F401 9 | image = protocols.ImageProtocol() # noqa: F401 10 | numpy = protocols.NumPyProtocol() # noqa: F401 11 | yaml = protocols.YamlProtocol() # noqa: F401 12 | path = protocols.PathProtocol() # noqa: F401 13 | geodataframe = protocols.GeoPandasProtocol() # noqa: F401 14 | json = protocols.JsonProtocol() # noqa: F401 15 | 16 | 17 | def frame(func=None, file_format=None, check_dtypes=None): 18 | """ 19 | Decorator indicating that an entity will always have a pandas DataFrame 20 | type. 21 | 22 | The frame values will be serialized to either Parquet (default) or Feather. 23 | Parquet is more popular, but some types of data or frame structures are 24 | only supported by one format or the other. In particular, ordered 25 | categorical columns are supported by Feather and not Parquet. 26 | 27 | This decorator can be used with or without arguments: 28 | 29 | .. code-block:: python 30 | 31 | @frame 32 | def dataframe(...): 33 | ... 34 | 35 | @frame(file_format='feather') 36 | def dataframe(...): 37 | ... 38 | 39 | Parameters 40 | ---------- 41 | file_format: {'parquet', 'feather'} (default: 'parquet') 42 | Which file format to use when saving values to disk. 43 | check_dtypes: boolean (default: True) 44 | Check for column types not supported by the file format. This 45 | check is best-effort and not guaranteed to catch all problems. If 46 | an unsupported data type is found, an exception will be thrown at 47 | serialization time. 48 | """ 49 | 50 | # If the first argument is present, we were (hopefully) used as a decorator 51 | # without any other arguments. 52 | if func is not None: 53 | if file_format is not None or check_dtypes is not None: 54 | raise ValueError("frame can't be called with both a function and keywords") 55 | if not callable(func): 56 | raise ValueError( 57 | oneline( 58 | """ 59 | frame must be used either (a) directly as a decorator or 60 | (b) with keyword arguments; 61 | it can't take positional arguments. 62 | """ 63 | ) 64 | ) 65 | return protocols.ParquetDataFrameProtocol()(func) 66 | 67 | # Otherwise, we have arguments and should return a decorator. 68 | if file_format is None or file_format == "parquet": 69 | kwargs = {} 70 | if check_dtypes is not None: 71 | kwargs["check_dtypes"] = check_dtypes 72 | return protocols.ParquetDataFrameProtocol(**kwargs) 73 | elif file_format == "feather": 74 | return protocols.FeatherDataFrameProtocol() 75 | else: 76 | raise ValueError( 77 | oneline( 78 | f""" 79 | file_format must be one of {'parquet', 'feather'}; 80 | got {file_format!r}""" 81 | ) 82 | ) 83 | 84 | 85 | # These need to be called with arguments. 86 | enum = protocols.EnumProtocol # noqa: F401 87 | type = protocols.TypeProtocol # noqa: F401 88 | -------------------------------------------------------------------------------- /docs/api/protocols.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Protocols 3 | =========== 4 | 5 | Introduction 6 | ------------ 7 | 8 | Protocols are special cases of Bionic decorators; their effect is to specify 9 | the `Serialization Protocol <../concepts.rst#serialization-protocols>`_ for the 10 | entity being defined. For example: 11 | 12 | .. code-block:: python 13 | 14 | # This entity should only have values equal to "short" or "long". 
15 | @builder 16 | @bn.protocol.enum('short', 'long') 17 | def name_length(name): 18 | if len(name) < 10: 19 | return 'short' 20 | else: 21 | return 'long' 22 | 23 | # This entity's value will always be a ``pandas.DataFrame``. 24 | @builder 25 | @bn.protocol.frame 26 | def raw_df(): 27 | from sklearn import datasets 28 | dataset = datasets.load_breast_cancer() 29 | df = pd.DataFrame( 30 | data=dataset.data, 31 | ) 32 | df['target'] = dataset.target 33 | return df 34 | 35 | Protocols are used to tell Bionic how to serialize, deserialize, and validate 36 | entity values. In most cases, Bionic's default protocol can figure out an 37 | appropriate way to handle each value, so explicit protocol decorators are 38 | usually not required. However, they can be useful for data types that need 39 | special handling, or just to add clarity, safety, or documentation to a 40 | entity definition. 41 | 42 | Protocols can also be used when creating new entities with ``declare`` or 43 | ``assign``: 44 | 45 | .. code-block:: python 46 | 47 | builder.assign('name_length', 'short', bn.protocol.enum('short', 'long')) 48 | builder.declare('raw_df', bn.protocol.frame) 49 | 50 | Custom Protocols 51 | ---------------- 52 | 53 | If you need to control how an entity is serialized, you can write your own 54 | custom protocol. (However, since Bionic is still at an early stage, future 55 | API changes may break your implementation.) 56 | 57 | .. code-block:: python 58 | 59 | class MyProtocol(BaseProtocol): 60 | def get_fixed_file_extension(self): 61 | """ 62 | Returns a file extension identifying this protocol. This value will be appended 63 | to the name of any file written by the protocol, and may be used to determine 64 | whether a file can be read by the protocol. 65 | 66 | This string should be unique, not shared with any other protocol. By 67 | convention, it doesn't include an initial period, but may include periods in 68 | the middle. (For example, `"csv"`, and `"csv.zip"` would both be sensible 69 | file extensions.) 70 | """ 71 | raise NotImplementedError() 72 | 73 | def write(self, value, path): 74 | """Serializes the object ``value`` to the pathlib path ``path``.""" 75 | raise NotImplementedError() 76 | 77 | def read(self, path): 78 | """Deserializes an object from the pathlib path ``path``, and returns it.""" 79 | raise NotImplementedError() 80 | 81 | Built-In Protocol Decorators 82 | ---------------------------- 83 | 84 | .. autofunction:: bionic.protocol.dask 85 | .. autofunction:: bionic.protocol.dillable 86 | .. autofunction:: bionic.protocol.enum 87 | .. autofunction:: bionic.protocol.frame 88 | .. autofunction:: bionic.protocol.geodataframe 89 | .. autofunction:: bionic.protocol.image 90 | .. autofunction:: bionic.protocol.json 91 | .. autofunction:: bionic.protocol.numpy 92 | .. autofunction:: bionic.protocol.path 93 | .. autofunction:: bionic.protocol.picklable 94 | .. autofunction:: bionic.protocol.type 95 | .. autofunction:: bionic.protocol.yaml 96 | -------------------------------------------------------------------------------- /tests/test_flow/test_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import threading 3 | 4 | import pytest 5 | 6 | 7 | @pytest.mark.allows_parallel 8 | def test_logging_details(builder, log_checker, parallel_execution_enabled): 9 | """ 10 | Test the details of the log messages we emit. 
Since these messages are currently the 11 | best way to get visibility into what Bionic is doing, we have much more detailed 12 | tests than we'd normally want for logging. This means we'll have to tweak these 13 | tests as we update the format or implementation details of our logging. 14 | 15 | At some point we should introduce a separate system for user-facing 16 | progress reporting instead of using logs. 17 | """ 18 | 19 | builder.assign("x", 1) 20 | 21 | @builder 22 | def x_plus_one(x): 23 | return x + 1 24 | 25 | @builder 26 | def x_plus_two(x_plus_one): 27 | return x_plus_one + 1 28 | 29 | flow = builder.build() 30 | assert flow.get("x_plus_one") == 2 31 | log_checker.expect_all( 32 | "Accessed x(x=1) from definition", 33 | "Computing x_plus_one(x=1) ...", 34 | "Computed x_plus_one(x=1)", 35 | ) 36 | 37 | assert flow.get("x_plus_two") == 3 38 | 39 | if parallel_execution_enabled: 40 | # This is different from serial execution because we don't pass 41 | # in-memory cache to the subprocesses. The subprocess loads the 42 | # entities from disk cache instead. 43 | log_checker.expect_all( 44 | "Loaded x_plus_one(x=1) from disk cache", 45 | "Computing x_plus_two(x=1) ...", 46 | "Computed x_plus_two(x=1)", 47 | ) 48 | else: 49 | log_checker.expect_all( 50 | "Accessed x_plus_one(x=1) from in-memory cache", 51 | "Computing x_plus_two(x=1) ...", 52 | "Computed x_plus_two(x=1)", 53 | ) 54 | 55 | flow = builder.build() 56 | assert flow.get("x_plus_one") == 2 57 | # We don't access the definitions for simple lookup objects in 58 | # parallel execution unless we use the objects for computation. 59 | # Since we load x_plus_one from disk cache, we don't access the 60 | # definition for x. 61 | # To clarify: we do access it for looking at the cache, but it's 62 | # taken from the case key where it is loaded by default and is not 63 | # counted as definition access in the flow. 64 | log_checker.expect_all("Loaded x_plus_one(x=1) from disk cache") 65 | 66 | flow = builder.build() 67 | assert flow.get("x_plus_two") == 3 68 | log_checker.expect_all("Loaded x_plus_two(x=1) from disk cache") 69 | 70 | flow = flow.setting("x_plus_one", 3) 71 | assert flow.get("x_plus_two") == 4 72 | log_checker.expect_all( 73 | "Accessed x_plus_one(x_plus_one=3) from definition", 74 | "Computing x_plus_two(x_plus_one=3) ...", 75 | "Computed x_plus_two(x_plus_one=3)", 76 | ) 77 | 78 | 79 | class CannotPickleMe: 80 | def __init__(self): 81 | # Storing a lock makes it unpickleable 82 | self.lock = threading.Lock() 83 | 84 | def __str__(self): 85 | return "Cannot pickle me" 86 | 87 | 88 | def test_log_unpickleable_value(builder, log_checker): 89 | @builder 90 | def log_unpickleable_value(): 91 | # Test that we handle unpickleable value in `LogRecord.msg`. 92 | logging.info(CannotPickleMe()) 93 | # Test that we handle unpickleable value in `LogRecord.args`. 
94 | logging.info("Logging unpickleable class: %s", CannotPickleMe()) 95 | return 5 96 | 97 | assert builder.build().get("log_unpickleable_value") == 5 98 | 99 | log_checker.expect_all( 100 | "Computing log_unpickleable_value() ...", 101 | "Cannot pickle me", 102 | "Logging unpickleable class: Cannot pickle me", 103 | "Computed log_unpickleable_value()", 104 | ) 105 | -------------------------------------------------------------------------------- /docs/tutorials/hello_world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hello World" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": { 13 | "raw_mimetype": "text/restructuredtext" 14 | }, 15 | "source": [ 16 | "Let's start with a very simple example: a Bionic flow that generates the text\n", 17 | "\"Hello world!\"\n", 18 | "\n", 19 | "*(The code for this example is available in the Bionic repo at\n", 20 | "example/hello_world.py.)*\n", 21 | "\n", 22 | ".. literalinclude:: ../../example/hello_world.py\n", 23 | " :language: python\n", 24 | " :linenos:\n", 25 | "\n", 26 | "We can run this code (assuming we've checked out the ``bionic`` repo) like\n", 27 | "this:\n", 28 | "\n", 29 | ".. code-block:: bash\n", 30 | "\n", 31 | " > cd bionic\n", 32 | " > python example/hello_world.py\n", 33 | " Hello world!\n", 34 | "\n", 35 | "We can also import it in an interpreter or notebook:" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Configure the PYTHONPATH for this notebook.\n", 45 | "import _tutorial_setup\n", 46 | "\n", 47 | "from example.hello_world import flow\n", 48 | "flow.get('message')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Although our `flow` object is immutable, we can easily make a new copy with a different value of `subject`:" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "new_flow = flow.setting('subject', 'universe')\n", 65 | "new_flow.get('message')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "We can also try changing the `message` directly:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "flow.setting('message', 'Goodbye world!').get('message')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "As a convenience, `setting` and `get` can be called by an alternative syntax which makes it easier for your notebook or interpreter to autocomplete entity names:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "flow.setting.subject('universe').get.message()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Finally, we can visualize our flow as a directed acyclic graph:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "flow.render_dag()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "This flow doesn't do much, but it illustrates how flows can be constructed, used, and 
modified.\n", 121 | "The next tutorial will demonstrate a more practical example." 122 | ] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.7.3" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /tests/test_flow/test_outputs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pandas as pd 4 | import pandas.testing as pdt 5 | 6 | from ..helpers import RoundingProtocol 7 | 8 | import bionic as bn 9 | from bionic.exception import EntityValueError, UndefinedEntityError 10 | 11 | 12 | @pytest.fixture(scope="function") 13 | def preset_builder(builder): 14 | builder.assign("x", 2) 15 | builder.assign("y", 3) 16 | 17 | return builder 18 | 19 | 20 | def test_output(preset_builder): 21 | builder = preset_builder 22 | 23 | @builder 24 | @bn.output("g") 25 | def f(x, y): 26 | return x + y 27 | 28 | flow = builder.build() 29 | 30 | assert flow.get("g") == 5 31 | 32 | with pytest.raises(UndefinedEntityError): 33 | flow.get("f") 34 | 35 | 36 | def test_outputs(builder): 37 | builder.assign("numerator", 14) 38 | builder.assign("denominator", 3) 39 | 40 | @builder 41 | @bn.outputs("quotient", "remainder") 42 | def divide(numerator, denominator): 43 | quotient = numerator // denominator 44 | remainder = numerator % denominator 45 | return quotient, remainder 46 | 47 | flow = builder.build() 48 | 49 | assert flow.get("quotient") == 4 50 | assert flow.get("remainder") == 2 51 | 52 | with pytest.raises(UndefinedEntityError): 53 | flow.get("divide") 54 | 55 | 56 | def test_outputs_custom_protocols_first(builder): 57 | builder.assign("location", (37.7, -122.4)) 58 | 59 | @builder 60 | @bn.outputs("lat", "lon") 61 | def latlon(location): 62 | return location 63 | 64 | @builder 65 | @RoundingProtocol() 66 | @bn.outputs("rounded_lat", "rounded_lon") 67 | def rounded_latlon(lat, lon): 68 | return lat, lon 69 | 70 | @builder 71 | @bn.outputs("other_rounded_lat", "other_rounded_lon") 72 | @RoundingProtocol() 73 | def other_rounded_latlon(lat, lon): 74 | return lat, lon 75 | 76 | flow = builder.build() 77 | 78 | assert flow.get("lat") == 37.7 79 | assert flow.get("lon") == -122.4 80 | 81 | assert flow.get("rounded_lat") == 38 82 | assert flow.get("rounded_lon") == -122 83 | 84 | assert flow.get("other_rounded_lat") == 38 85 | assert flow.get("other_rounded_lon") == -122 86 | 87 | 88 | # I'm not sure if there's an easy way to test that we're using the correct 89 | # default protocol for each type, but at least we can check that nothing 90 | # breaks. 
91 | def test_outputs_default_protocols(builder): 92 | expected_df = pd.DataFrame(columns=["x", "y"], data=[[1, 2], [3, 4]]) 93 | 94 | @builder 95 | @bn.outputs("size", "df") 96 | def f(): 97 | df = expected_df.copy() 98 | return len(df), df 99 | 100 | flow = builder.build() 101 | 102 | assert flow.get("size") == 2 103 | pdt.assert_frame_equal(flow.get("df"), expected_df) 104 | 105 | 106 | def test_singleton_outputs(builder): 107 | @builder 108 | @bn.outputs("a") 109 | def one_output(): 110 | return (2,) 111 | 112 | assert builder.build().get("a") == 2 113 | 114 | 115 | def test_wrong_number_of_outputs(builder): 116 | @builder 117 | @bn.outputs("a", "b") 118 | def three_outputs(): 119 | return (1, 2, 3) 120 | 121 | flow = builder.build() 122 | with pytest.raises(EntityValueError): 123 | flow.get("a") 124 | 125 | 126 | def test_non_sequence_outputs(builder): 127 | @builder 128 | @bn.outputs("a", "b") 129 | def non_sequence_output(): 130 | return 1 131 | 132 | flow = builder.build() 133 | with pytest.raises(EntityValueError): 134 | flow.get("a") 135 | 136 | 137 | def test_non_sequence_outputs_message(builder): 138 | @builder 139 | @bn.outputs("a") 140 | def non_sequence_output(): 141 | return 7 142 | 143 | flow = builder.build() 144 | with pytest.raises(EntityValueError) as e: 145 | flow.get("a") 146 | assert "did you mean to use @output instead of @outputs?" in e.value 147 | -------------------------------------------------------------------------------- /example/ml_workflow_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | A CLI for running an extended version of the ML example in `ml_workflow`. 3 | 4 | Fits and evaluates a model on a scikit-learn dataset. 5 | """ 6 | 7 | import time 8 | 9 | import pandas as pd 10 | from sklearn import datasets, metrics 11 | 12 | import bionic as bn 13 | from .ml_workflow import flow as base_ml_flow 14 | 15 | # Add an AUC score summary to our flow. 16 | builder = base_ml_flow.to_builder() 17 | 18 | 19 | @builder 20 | def auc_score(test_frame, prediction_frame): 21 | """ 22 | The Area Under the (Receiver Operating Characteristic) Curve. 23 | """ 24 | return metrics.roc_auc_score( 25 | test_frame["target"], 26 | prediction_frame["proba"], 27 | ) 28 | 29 | 30 | @builder 31 | @bn.gather(over="hyperparams_dict", also="auc_score", into="gathered_frame") 32 | @bn.outputs("best_hyperparams_dict", "best_auc_score") 33 | @bn.docs( 34 | """The hyperparameter settings with the highest AUC score.""", 35 | """The best (highest) AUC score, compared over all hyperparameter settings.""", 36 | ) 37 | def best_settings(gathered_frame): 38 | best_row = gathered_frame.sort_values("auc_score", ascending=False).iloc[0] 39 | return best_row[["hyperparams_dict", "auc_score"]] 40 | 41 | 42 | flow = builder.build() 43 | 44 | # Compute and print the model performance. 
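# Example invocations (illustrative sketch; the flags are the ones defined by the
# argparse parser below, the bucket name is a placeholder, and the script should be
# run with `-m` because of the relative import of `ml_workflow`):
#
#     python -m example.ml_workflow_cli
#     python -m example.ml_workflow_cli -C 0.1,1,10 --parallel
#     python -m example.ml_workflow_cli --bucket my-gcs-bucket --big-dataset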
45 | if __name__ == "__main__": 46 | import argparse 47 | 48 | parser = argparse.ArgumentParser(description="Runs a simple ML workflow example") 49 | parser.add_argument("--bucket", "-b", help="Name of GCS bucket to cache in") 50 | parser.add_argument( 51 | "--quiet", "-q", help="Don't enable INFO-level logging", action="store_true" 52 | ) 53 | parser.add_argument( 54 | "--parallel", "-p", help="Run flow in parallel", action="store_true" 55 | ) 56 | parser.add_argument( 57 | "-C", 58 | help="Value or values (comma-separated) for " 59 | "the inverse regularization parameter 'C'", 60 | ) 61 | parser.add_argument( 62 | "--big-dataset", "-B", help="Use bigger covertype dataset", action="store_true" 63 | ) 64 | parser.add_argument( 65 | "--render-dag", 66 | "-D", 67 | help="Render DAG visualization to file instead of running", 68 | ) 69 | 70 | args = parser.parse_args() 71 | if not args.quiet: 72 | bn.utils.misc.init_basic_logging() 73 | if args.bucket is not None: 74 | flow = flow.setting( 75 | "core__persistent_cache__gcs__bucket_name", args.bucket 76 | ).setting("core__persistent_cache__gcs__enabled", True) 77 | if args.C is not None: 78 | c_values_str = args.C 79 | c_values = [ 80 | float(c_value_str.strip()) for c_value_str in c_values_str.split(",") 81 | ] 82 | flow = flow.setting("hyperparams_dict", values=[{"C": c} for c in c_values]) 83 | if args.big_dataset: 84 | builder = flow.to_builder() 85 | 86 | @builder 87 | @bn.version("covtype dataset") 88 | def raw_frame(): 89 | dataset = datasets.fetch_covtype() 90 | feature_names = [f"feature_{ix}" for ix in range(dataset.data.shape[1])] 91 | df = pd.DataFrame(data=dataset.data, columns=feature_names) 92 | # This is a multiclass dataset, but we want to treat it as a binary one. 93 | # We'll just try to detect class 2, since that one is the most common. 94 | df["target"] = dataset.target == 2 95 | return df 96 | 97 | flow = builder.build() 98 | if args.parallel: 99 | flow = flow.setting("core__parallel_execution__enabled", True) 100 | 101 | if args.render_dag: 102 | flow.render_dag().save(args.render_dag) 103 | exit() 104 | 105 | start = time.time() 106 | all_hpds = flow.get("hyperparams_dict", collection=list) 107 | best_hpd = flow.get("best_hyperparams_dict") 108 | best_auc_score = flow.get("best_auc_score") # noqa: F811 109 | end = time.time() 110 | 111 | print(f"Number of hyperparameter configurations tested: {len(all_hpds)}") 112 | print(f"Best hyperparameter configuration: {best_hpd!r}") 113 | print(f"Best AUC: {best_auc_score!r}") 114 | print(f"Total time taken: {end - start}") 115 | -------------------------------------------------------------------------------- /bionic/deps/optdep.py: -------------------------------------------------------------------------------- 1 | import re 2 | import importlib 3 | 4 | from .extras import extras_require as package_desc_lists_by_extra 5 | from ..utils.misc import oneline 6 | 7 | 8 | ILLEGAL_NAME_CHAR = re.compile("[^a-zA-Z0-9\\-._\\[\\]]") 9 | 10 | 11 | def first_token_from_package_desc(desc): 12 | first_mismatch = ILLEGAL_NAME_CHAR.search(desc) 13 | if first_mismatch is None: 14 | return desc 15 | 16 | if desc[first_mismatch.start()] not in " <>!=": 17 | raise AssertionError( 18 | oneline( 19 | f""" 20 | Package descriptor {desc!r} contained 21 | unexpected character {desc[first_mismatch.start()]!r}""" 22 | ) 23 | ) 24 | 25 | return desc[: first_mismatch.start()] 26 | 27 | 28 | # For packages that we don't import by the exact package name, these are 29 | # aliases we use. 
30 | alias_lists_by_package = { 31 | "google-cloud-logging": ["google.cloud.logging"], 32 | "google-auth": ["google.auth"], 33 | "Pillow": ["PIL.Image"], 34 | "dask[dataframe]": ["dask.dataframe"], 35 | "google-api-python-client": ["googleapiclient.discovery"], 36 | } 37 | 38 | # Now we contruct a new data structure to allow us to give helpful error 39 | # messages when the user tries to import a package that's not available. 40 | extras_by_importable_name = {} 41 | for extra, package_descs in package_desc_lists_by_extra.items(): 42 | for package_desc in package_descs: 43 | package = first_token_from_package_desc(package_desc) 44 | 45 | # Associate this package with the extra it belongs to -- as long as 46 | # we haven't seen this package before. (Because we're iterating over 47 | # an OrderedDict, this will end up associating each package with the 48 | # first extra that requires it, which should also be the most specific 49 | # extra, and therefore the most helpful one to mention in an error 50 | # message.) 51 | if package not in extras_by_importable_name: 52 | extras_by_importable_name[package] = extra 53 | 54 | if package in alias_lists_by_package: 55 | for importable_name in alias_lists_by_package[package]: 56 | assert importable_name not in extras_by_importable_name 57 | extras_by_importable_name[importable_name] = extra 58 | 59 | # This is a fake entry for testing, since it's annoying to mock this. 60 | TEST_EXTRA_NAME = "_FAKE_TEST_EXTRA_" 61 | TEST_PACKAGE_NAME = "_FAKE_TEST_PACKAGE_" 62 | extras_by_importable_name[TEST_PACKAGE_NAME] = TEST_EXTRA_NAME 63 | 64 | 65 | # This is based on a similar function in Pandas: 66 | # https://github.com/pandas-dev/pandas/blob/8ea102acdb45bb70cb30ea77108a50054c28c24d/pandas/compat/_optional.py 67 | def import_optional_dependency(name, purpose=None, raise_on_missing=True): 68 | """ 69 | Attempts to import a Python module that may or may not be available. If 70 | it's not available, this function throws an ImportError explaining what the 71 | user needs to install. (Unless ``raise_on_missing`` is set to False, in 72 | which case it returns None.) 73 | """ 74 | 75 | if name not in extras_by_importable_name: 76 | raise AssertionError( 77 | oneline( 78 | f""" 79 | Attempted to import {name!r}, 80 | which is not registered as a dependency""" 81 | ) 82 | ) 83 | 84 | # TODO Once we have specific version requirements for our optional 85 | # packages, we should check that the version is correct. 86 | 87 | try: 88 | return importlib.import_module(name) 89 | except ImportError: 90 | if raise_on_missing: 91 | extra_name = extras_by_importable_name[name] 92 | 93 | if purpose is None: 94 | description = "required" 95 | else: 96 | description = "required for " + purpose 97 | 98 | raise ImportError( 99 | oneline( 100 | f""" 101 | Unable to import package {name!r}, which is {description}; 102 | you can use ``pip install 'bionic[{extra_name}]'`` 103 | to resolve this""" 104 | ) 105 | ) 106 | 107 | else: 108 | return None 109 | -------------------------------------------------------------------------------- /bionic/utils/keyed_priority_stack.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides an implementation of an updatable priority queue with specific ordering rules. 3 | 4 | The other implementations I found (including `heapq` and `queue.PriorityQueue`) all have 5 | one or more problems: 6 | 7 | 1. 
They're based on min-heaps, so they return the lowest value first; this is 8 | counterintuitive when dealing with priorities. 9 | 2. Built-in tiebreaking is not provided. 10 | 3. Updating priorities is impossible, or needs to be implemented separately. 11 | 12 | The implementation here is based on the `heapq` implementation of a binary heap, but 13 | adds reversed ordering (highest priority first), LIFO tiebreaking, and keyed lookup. 14 | """ 15 | 16 | import heapq 17 | from functools import total_ordering 18 | 19 | 20 | class KeyedPriorityStack: 21 | """ 22 | An updatable priority queue where ties are broken in last-in-first-out (LIFO) order. 23 | 24 | This data structure has a stack-like interface, supporting `push` and `pop`, but 25 | each element on the stack also has an associated key and priority. By default, 26 | `pop` removes returns the element with the *highest* priority (breaking ties in 27 | LIFO order), but it also accepts an optional key argument that specifies a specific 28 | element to be popped. This can be used to easily update an element's priority. 29 | """ 30 | 31 | def __init__(self): 32 | self._heap = [] 33 | self._next_seq_id = 0 34 | self._n_unremoved_entries = 0 35 | self._unremoved_entries_by_key = {} 36 | 37 | def push(self, key, value, priority): 38 | """ 39 | Adds a value to the stack with associated key and priority. 40 | """ 41 | 42 | if key is None: 43 | raise KeyError("Attempted to add None as key to priority stack") 44 | if key in self._unremoved_entries_by_key: 45 | raise ValueError( 46 | f"Attempted to add duplicate key to priority stack: {key!r}" 47 | ) 48 | seq_id = self._next_seq_id 49 | self._next_seq_id += 1 50 | entry = PriorityEntry(priority, seq_id, key, value) 51 | self._unremoved_entries_by_key[key] = entry 52 | heapq.heappush(self._heap, entry) 53 | self._n_unremoved_entries += 1 54 | 55 | def pop(self, key=None): 56 | """ 57 | Removes a value from the stack and returns it. 58 | 59 | If no key is provided, removes and returns the highest-priority element (or 60 | the last-added such element, if there is a tie). 61 | 62 | If a key is provided, removes and returns the element with that key. 63 | """ 64 | 65 | if key is not None: 66 | if key not in self._unremoved_entries_by_key: 67 | raise KeyError(f"Key not found in priority stack: {key!r}") 68 | entry = self._unremoved_entries_by_key.pop(key) 69 | entry.is_removed = True 70 | self._n_unremoved_entries -= 1 71 | return entry.value 72 | 73 | else: 74 | while True: 75 | if self._n_unremoved_entries == 0: 76 | raise IndexError("Attempted to get item from empty priority stack") 77 | entry = heapq.heappop(self._heap) 78 | if entry.is_removed: 79 | continue 80 | self._n_unremoved_entries -= 1 81 | del self._unremoved_entries_by_key[entry.key] 82 | return entry.value 83 | 84 | def __len__(self): 85 | """ 86 | Returns the number of elements on the stack. 
87 | """ 88 | 89 | return self._n_unremoved_entries 90 | 91 | 92 | @total_ordering 93 | class PriorityEntry: 94 | def __init__(self, priority, seq_id, key, value): 95 | self.priority = priority 96 | self.seq_id = seq_id 97 | self.key = key 98 | self.value = value 99 | self.is_removed = False 100 | 101 | def __lt__(self, other): 102 | assert isinstance(other, PriorityEntry) 103 | 104 | return (self.priority, self.seq_id) > (other.priority, other.seq_id) 105 | 106 | def __eq__(self, other): 107 | if not isinstance(other, PriorityEntry): 108 | return False 109 | return (self.priority, self.seq_id) == (other.priority, other.seq_id) 110 | -------------------------------------------------------------------------------- /docs/get-started.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Get Started 3 | =========== 4 | 5 | Installation 6 | ------------ 7 | 8 | Bionic can be installed using ``pip``: 9 | 10 | .. code-block:: bash 11 | 12 | pip install 'bionic[standard]' 13 | 14 | The ``bionic[standard]`` package includes the core framework as well as the 15 | most commonly-used dependencies. There are several other subpackages offering 16 | different dependencies, documented :ref:`below`. 17 | 18 | You will probably also want to install `Graphviz `_, 19 | which Bionic uses to generate visualizations of its workflow graph. 20 | Unfortunately Graphviz is not written in Python and can't be installed by 21 | ``pip``. On Mac OS X, you can use `Homebrew `_ to install 22 | it: 23 | 24 | .. code-block:: bash 25 | 26 | brew install graphviz 27 | 28 | If you want your data to be automatically `cached to Google Cloud Storage`_, 29 | you'll also need to have the `Google Cloud SDK`_ installed, have access to a 30 | GCS bucket, and install the ``bionic[gcp]`` subpackage. 31 | 32 | .. _cached to Google Cloud Storage: concepts.rst#caching-in-google-cloud-storage 33 | .. _Google Cloud SDK : https://cloud.google.com/sdk/ 34 | 35 | Finally, installing `LibYAML `_ will improve 36 | performance for some workloads. LibYAML is also available via Homebrew: 37 | 38 | .. code-block:: bash 39 | 40 | brew install libyaml 41 | 42 | Bionic supports Python 3.7 and above. 43 | 44 | .. _extra-packages: 45 | 46 | Extra Packages 47 | .............. 48 | 49 | The default ``bionic`` PyPI package installs only the minimal dependencies for 50 | building and running flows. However, many other dependency configurations are 51 | available. Most users will want the ``bionic[standard]`` package, which 52 | supports common integrations like `Matplotlib `_, 53 | as well as `graph visualization`_. 54 | 55 | .. 
_graph visualization: concepts.rst#visualizing-flows 56 | 57 | The full set of subpackages is as follows: 58 | 59 | ========== ==================================== ================================ 60 | Subpackage Installation Command Enables 61 | ========== ==================================== ================================ 62 | dask ``pip install 'bionic[dask]'`` the ``@dask`` decorator 63 | ---------- ------------------------------------ -------------------------------- 64 | dev ``pip install 'bionic[dev]'`` every feature; testing; building 65 | documentation 66 | ---------- ------------------------------------ -------------------------------- 67 | dill ``pip install 'bionic[dill]'`` the ``@dillable`` decorator 68 | ---------- ------------------------------------ -------------------------------- 69 | examples ``pip install 'bionic[examples]'`` the tutorial example code 70 | ---------- ------------------------------------ -------------------------------- 71 | full ``pip install 'bionic[full]'`` every non-development feature 72 | ---------- ------------------------------------ -------------------------------- 73 | gcp ``pip install 'bionic[gcp]'`` caching to GCS 74 | ---------- ------------------------------------ -------------------------------- 75 | geopandas ``pip install 'bionic[geopandas]'`` the ``@geodataframe`` decorator 76 | ---------- ------------------------------------ -------------------------------- 77 | image ``pip install 'bionic[image]'`` automatic de/serialization of 78 | ``PIL.Image`` objects 79 | ---------- ------------------------------------ -------------------------------- 80 | matplotlib ``pip install 'bionic[matplotlib]'`` the ``@pyplot`` decorator 81 | ---------- ------------------------------------ -------------------------------- 82 | parallel ``pip install 'bionic[parallel]'`` parallel execution 83 | ---------- ------------------------------------ -------------------------------- 84 | standard ``pip install 'bionic[standard]'`` graph visualization; ``Image`` 85 | handling; ``@pyplot`` 86 | ---------- ------------------------------------ -------------------------------- 87 | viz ``pip install 'bionic[viz]'`` graph visualization 88 | ========== ==================================== ================================ 89 | 90 | Tutorials 91 | --------- 92 | 93 | These two worked examples illustrate the basic mechanics of Bionic. 94 | 95 | .. toctree:: 96 | :maxdepth: 1 97 | 98 | tutorials/hello_world.ipynb 99 | tutorials/ml_workflow.ipynb 100 | -------------------------------------------------------------------------------- /bionic/aip/docker_image_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Builds a docker image for Google AI Platform execution using the current Python 3 | environment. 
4 | """ 5 | import pathlib 6 | import subprocess 7 | from concurrent.futures import Future 8 | from concurrent.futures.thread import ThreadPoolExecutor 9 | from textwrap import dedent 10 | 11 | from bionic.deps.optdep import import_optional_dependency 12 | 13 | import hashlib 14 | import sys 15 | import tempfile 16 | import re 17 | import logging 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | _cached_docker_module = None 23 | _cached_docker_client = None 24 | 25 | 26 | def get_docker_module(): 27 | global _cached_docker_module 28 | 29 | if _cached_docker_module is None: 30 | _cached_docker_module = import_optional_dependency( 31 | "docker", purpose="Build Docker images" 32 | ) 33 | 34 | return _cached_docker_module 35 | 36 | 37 | def get_docker_client(): 38 | global _cached_docker_client 39 | 40 | if _cached_docker_client is None: 41 | docker = get_docker_module() 42 | logger.info("Initializing Docker client ...") 43 | _cached_docker_client = docker.from_env() 44 | 45 | return _cached_docker_client 46 | 47 | 48 | def fix_pip_requirements(pip_requirements: str) -> str: 49 | # Pip freeze may contain entries with editable installs pointing to remote 50 | # git repositories. This can happen when doing Bionic development. Docker 51 | # service is not able to access repositories using the git+git protocol. 52 | # Hence, any entries containing git+git is converted to use git+https. 53 | # 54 | # Example entry: 55 | # -e git+git@github.com:square/bionic.git@f13f5405e928d92b553d2cbee41084eecccf7de3#egg=bionic 56 | # 57 | # Converted entry: 58 | # -e git+https://github.com/square/bionic.git@f13f5405e928d92b553d2cbee41084eecccf7de3#egg=bionic 59 | 60 | def fix_line(line: str) -> str: 61 | if line.startswith("-e git+git"): 62 | return re.sub(r"-e ([^@]*)@", "-e git+https://", line.replace(":", "/")) 63 | else: 64 | return line 65 | 66 | return "\n".join([fix_line(line) for line in pip_requirements.split("\n")]) 67 | 68 | 69 | def get_pip_freeze() -> str: 70 | return subprocess.run( 71 | ["pip", "freeze"], capture_output=True, check=True, encoding="utf-8" 72 | ).stdout 73 | 74 | 75 | def get_pip_requirements() -> str: 76 | return fix_pip_requirements(get_pip_freeze()) 77 | 78 | 79 | def get_image_uri(project_name: str, pip_requirements: str) -> str: 80 | m = hashlib.sha256() 81 | m.update(pip_requirements.encode("utf-8")) 82 | m.update(str(sys.version_info).encode("utf-8")) 83 | 84 | image_tag = f"bionic_{m.hexdigest()}" 85 | 86 | return f"gcr.io/{project_name}/bionic:{image_tag}" 87 | 88 | 89 | def build_image( 90 | docker_client, 91 | pip_requirements: str, 92 | image_uri: str, 93 | ): 94 | with tempfile.TemporaryDirectory() as tmp_dir: 95 | tmp_path = pathlib.Path(tmp_dir) 96 | 97 | (tmp_path / "requirements.txt").write_text(pip_requirements) 98 | 99 | container_image = f"python:{sys.version_info[0]}.{sys.version_info[1]}" 100 | 101 | (tmp_path / "Dockerfile").write_text( 102 | dedent( 103 | f""" 104 | FROM {container_image} 105 | COPY requirements.txt requirements.txt 106 | RUN pip install -r requirements.txt 107 | """ 108 | ) 109 | ) 110 | 111 | logger.info(f"Building {image_uri} using {container_image}") 112 | image, _ = docker_client.images.build(path=tmp_dir, tag=f"{image_uri}") 113 | 114 | logger.info(f"Pushing {image_uri}") 115 | for line in docker_client.images.push(f"{image_uri}", stream=True, decode=True): 116 | logger.debug(line) 117 | 118 | logger.info(f"Uploaded {image_uri}") 119 | 120 | 121 | def build_image_if_missing(project_name: str) -> str: 122 | pip_requirements = 
get_pip_requirements() 123 | image_uri = get_image_uri(project_name, pip_requirements) 124 | 125 | docker = get_docker_module() 126 | docker_client = get_docker_client() 127 | 128 | try: 129 | docker_client.images.get_registry_data(image_uri) 130 | logger.info(f"{image_uri} already exists") 131 | except docker.errors.NotFound: 132 | build_image( 133 | docker_client=docker_client, 134 | pip_requirements=pip_requirements, 135 | image_uri=image_uri, 136 | ) 137 | 138 | return image_uri 139 | 140 | 141 | def build_image_if_missing_async(project_name: str) -> Future: 142 | return ThreadPoolExecutor(max_workers=1).submit( 143 | build_image_if_missing, project_name 144 | ) 145 | -------------------------------------------------------------------------------- /tests/test_flow/test_copy.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import dask.dataframe as dd 5 | import pytest 6 | 7 | import bionic as bn 8 | from ..helpers import df_from_csv_str, equal_frame_and_index_content 9 | 10 | 11 | @pytest.fixture 12 | def preset_builder(builder): 13 | builder.assign("x", 2) 14 | builder.assign("y", 3) 15 | 16 | @builder 17 | def f(x, y): 18 | return x + y 19 | 20 | return builder 21 | 22 | 23 | @pytest.fixture 24 | def flow(preset_builder): 25 | return preset_builder.build() 26 | 27 | 28 | @pytest.fixture 29 | def expected_dask_df(): 30 | df_value = df_from_csv_str( 31 | """ 32 | color,number 33 | red,1 34 | blue,2 35 | green,3 36 | """ 37 | ) 38 | return dd.from_pandas(df_value, npartitions=1) 39 | 40 | 41 | @pytest.fixture 42 | def dask_flow(builder, expected_dask_df): 43 | @builder 44 | @bn.protocol.dask 45 | def dask_df(): 46 | return expected_dask_df 47 | 48 | return builder.build() 49 | 50 | 51 | @pytest.fixture 52 | def override_gcs_for_copy_if_fake_gcp(use_fake_gcp, gcs_fs, monkeypatch): 53 | """ 54 | A flow has an instance of GCS filesystem if GCS caching is enabled. But we 55 | still need to support the case where the user wants to upload the results to 56 | GCS even though GCS caching is disabled for the flow. Hence, the 57 | upload_to_gcs method does not use the flow's GCS filesystem in case GCS 58 | caching is disabled. If we have a fake GCS filesystem, we have to patch it 59 | manually. 
60 | """ 61 | 62 | if use_fake_gcp: 63 | monkeypatch.setattr("bionic.gcs.get_gcs_fs_without_warnings", lambda: gcs_fs) 64 | 65 | 66 | def test_copy_file_to_existing_local_dir(flow, tmp_path): 67 | dir_path = tmp_path / "output" 68 | dir_path.mkdir() 69 | flow.get("f", mode="FileCopier").copy(destination=dir_path) 70 | 71 | expected_file_path = dir_path / "f.json" 72 | assert json.loads(expected_file_path.read_bytes()) == 5 73 | 74 | 75 | def test_copy_file_to_local_file(flow, tmp_path): 76 | file_path = tmp_path / "data.json" 77 | flow.get("f", mode="FileCopier").copy(destination=file_path) 78 | 79 | assert json.loads(file_path.read_bytes()) == 5 80 | 81 | 82 | def test_copy_file_to_local_file_using_str(flow, tmp_path): 83 | file_path = tmp_path / "data.json" 84 | file_path_str = str(file_path) 85 | flow.get("f", mode="FileCopier").copy(destination=file_path_str) 86 | assert json.loads(file_path.read_bytes()) == 5 87 | 88 | 89 | @pytest.mark.needs_gcs 90 | def test_copy_file_to_gcs_dir( 91 | flow, tmp_path, tmp_gcs_url_prefix, override_gcs_for_copy_if_fake_gcp, gcs_fs 92 | ): 93 | flow.get("f", mode="FileCopier").copy(destination=tmp_gcs_url_prefix) 94 | cloud_url = tmp_gcs_url_prefix + "f.json" 95 | local_path = tmp_path / "f.json" 96 | gcs_fs.get_file(cloud_url, local_path) 97 | assert json.loads(local_path.read_bytes()) == 5 98 | 99 | 100 | @pytest.mark.needs_gcs 101 | def test_copy_file_to_gcs_file( 102 | flow, tmp_path, tmp_gcs_url_prefix, override_gcs_for_copy_if_fake_gcp, gcs_fs 103 | ): 104 | cloud_url = tmp_gcs_url_prefix + "f.json" 105 | flow.get("f", mode="FileCopier").copy(destination=cloud_url) 106 | local_path = tmp_path / "f.json" 107 | gcs_fs.get_file(cloud_url, local_path) 108 | assert json.loads(local_path.read_bytes()) == 5 109 | 110 | 111 | def test_copy_dask_to_dir(tmp_path, expected_dask_df, dask_flow): 112 | destination = tmp_path / "output" 113 | destination.mkdir() 114 | expected_dir_path = destination / "dask_df.pq.dask" 115 | 116 | dask_flow.get("dask_df", mode="FileCopier").copy(destination=destination) 117 | 118 | actual = dd.read_parquet(expected_dir_path) 119 | assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute()) 120 | 121 | 122 | @pytest.mark.needs_gcs 123 | def test_copy_dask_to_gcs_dir( 124 | tmp_path, 125 | tmp_gcs_url_prefix, 126 | expected_dask_df, 127 | dask_flow, 128 | override_gcs_for_copy_if_fake_gcp, 129 | gcs_fs, 130 | ): 131 | cloud_url = tmp_gcs_url_prefix + "output" 132 | local_path = tmp_path / "output" 133 | 134 | dask_flow.get("dask_df", mode="FileCopier").copy(destination=cloud_url) 135 | 136 | gcs_fs.get(cloud_url, str(local_path), recursive=True) 137 | actual = dd.read_parquet(local_path) 138 | assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute()) 139 | 140 | 141 | def test_get_multi_value_entity(builder): 142 | my_set = {"oscar", "the", "grouch"} 143 | builder.assign("val", values=my_set) 144 | 145 | @builder 146 | def multi_entity(val): 147 | return val 148 | 149 | flow = builder.build() 150 | results = flow.get("multi_entity", collection=set, mode=Path) 151 | results = {json.loads(res.read_bytes()) for res in results} 152 | 153 | assert results == my_set 154 | -------------------------------------------------------------------------------- /tests/test_flow/test_dagviz.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for dagviz and FlowImage class. 
3 | """ 4 | 5 | import pytest 6 | from xml.etree import ElementTree as ET 7 | from PIL import Image 8 | 9 | import bionic as bn 10 | from bionic import dagviz 11 | 12 | 13 | @pytest.fixture 14 | def flow(builder): 15 | builder.assign("first_name", values=["Alice", "Bob"]) 16 | builder.assign("last_name", "Smith") 17 | 18 | @builder 19 | @bn.outputs("full_name", "initials") 20 | @bn.docs( 21 | """The full name.""", 22 | """Just the initials.""", 23 | ) 24 | def _(first_name, last_name): 25 | return f"{first_name} {last_name}", f"{first_name[0]}{last_name[0]}" 26 | 27 | @builder 28 | @bn.gather(over="full_name") 29 | @bn.returns("all_names,") 30 | def _(gather_df): 31 | """Comma-separated list of names.""" 32 | return ", ".join(gather_df["full_name"]) 33 | 34 | return builder.build() 35 | 36 | 37 | @pytest.fixture 38 | def flow_image(flow): 39 | return flow.render_dag() 40 | 41 | 42 | @pytest.fixture 43 | def flow_graph(flow): 44 | return flow._deriver.export_dag() 45 | 46 | 47 | @pytest.fixture 48 | def flow_dot(flow_graph): 49 | return dagviz.dot_from_graph(flow_graph) 50 | 51 | 52 | def nodes_by_name_from_dot(dot): 53 | return { 54 | node.get_name(): node 55 | for subgraph in dot.get_subgraphs() 56 | for node in subgraph.get_nodes() 57 | } 58 | 59 | 60 | def test_dag_size(flow_graph): 61 | assert len(flow_graph.nodes) == 11 62 | 63 | 64 | def test_dot_names_and_colors(flow_dot): 65 | nodes = nodes_by_name_from_dot(flow_dot) 66 | same_color_name_groups = [ 67 | # We've wrapped all our names in quotes to work around pydot. However, they're 68 | # not visible in the visualization. 69 | ['"first_name[0]"', '"first_name[1]"'], 70 | ['"last_name"'], 71 | [ 72 | '"[0]"', 73 | '"[1]"', 74 | '"full_name[0]"', 75 | '"full_name[1]"', 76 | '"initials[0]"', 77 | '"initials[1]"', 78 | ], 79 | ['""', '"all_names"'], 80 | ] 81 | 82 | all_names = [name for name_group in same_color_name_groups for name in name_group] 83 | assert set(nodes.keys()) == set(all_names) 84 | 85 | all_group_colors = set() 86 | for name_group in same_color_name_groups: 87 | group_colors = set(nodes[name].get_fillcolor() for name in name_group) 88 | assert len(group_colors) == 1 89 | (group_color,) = group_colors 90 | assert group_color not in all_group_colors 91 | all_group_colors.add(group_color) 92 | 93 | 94 | def test_dot_tooltips(flow_dot): 95 | nodes = nodes_by_name_from_dot(flow_dot) 96 | assert nodes['"last_name"'].get_tooltip() == "Persisted: True" 97 | assert ( 98 | nodes['"all_names"'].get_tooltip() 99 | == "Comma-separated list of names.\n\nPersisted: True" 100 | ) 101 | assert ( 102 | nodes['"initials[0]"'].get_tooltip() == "Just the initials.\n\nPersisted: True" 103 | ) 104 | assert ( 105 | nodes['"initials[1]"'].get_tooltip() == "Just the initials.\n\nPersisted: True" 106 | ) 107 | assert ( 108 | nodes['"[0]"'].get_tooltip() 109 | == "(Intermediate value) A Python tuple with 2 values.\n\nPersisted: False" 110 | ) 111 | 112 | 113 | def test_save_flowimage_file_path(tmp_path, flow_image): 114 | """When a file path is given as input, and type is supported by PIL 115 | check that output image format is preserved.""" 116 | filepath = tmp_path / "test.png" 117 | flow_image.save(filepath) 118 | output = Image.open(filepath) 119 | assert output.format == "PNG" 120 | 121 | 122 | def test_save_flowimage_file_path_svg(tmp_path, flow_image): 123 | """When a file path is given as input and svg as the format""" 124 | filepath = tmp_path / "test.svg" 125 | flow_image.save(filepath) 126 | output_text = (tmp_path / 
"test.svg").read_text() 127 | try: 128 | ET.fromstring(output_text) 129 | except ET.ParseError: 130 | pytest.fail( 131 | "output from saving SVG to file object not well formed XML {}".format( 132 | output_text 133 | ) 134 | ) 135 | 136 | 137 | def test_save_flowimage_file_object(tmp_path, flow_image): 138 | """When a file object is given as input, use PIL interface to save""" 139 | with open(tmp_path / "test.png", "wb") as file_object: 140 | flow_image.save(file_object, format="png") 141 | output = Image.open(tmp_path / "test.png") 142 | assert output.format == "PNG" 143 | 144 | 145 | def test_save_flowimage_file_object_svg(tmp_path, flow_image): 146 | """When a file object is given as input and file is svg, use builtin interface to save""" 147 | with open(tmp_path / "test.svg", "wb") as file_object: 148 | flow_image.save(file_object, format="svg") 149 | output_text = (tmp_path / "test.svg").read_text() 150 | try: 151 | ET.fromstring(output_text) 152 | except ET.ParseError: 153 | pytest.fail( 154 | "output from saving SVG to file object not well formed XML {}".format( 155 | output_text 156 | ) 157 | ) 158 | -------------------------------------------------------------------------------- /example/ml_workflow.py: -------------------------------------------------------------------------------- 1 | """ 2 | A toy ML workflow intended to demonstrate basic Bionic features. Trains a 3 | logistic regression model on the UCI ML Breast Cancer Wisconsin (Diagnostic) 4 | dataset. 5 | """ 6 | 7 | import re 8 | 9 | import pandas as pd 10 | from sklearn import datasets, linear_model, metrics, model_selection 11 | 12 | import bionic as bn 13 | 14 | # Initialize our builder. 15 | builder = bn.FlowBuilder("ml_workflow") 16 | 17 | # Define some basic parameters. 18 | builder.assign( 19 | "random_seed", 0, doc="Arbitrary seed for all random decisions in the flow." 20 | ) 21 | builder.assign( 22 | "test_split_fraction", 0.3, doc="Fraction of data to include in test set." 23 | ) 24 | builder.assign( 25 | "hyperparams_dict", {"C": 1}, doc="Hyperparameters to use when training the model." 26 | ) 27 | builder.assign( 28 | "feature_inclusion_regex", 29 | ".*", 30 | doc="Regular expression specifying which feature names to include.", 31 | ) 32 | 33 | 34 | # Load the raw data. 35 | @builder 36 | def raw_frame(): 37 | """ 38 | The raw data, including all features and a `target` column of labels. 39 | """ 40 | dataset = datasets.load_breast_cancer() 41 | df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names) 42 | df["target"] = dataset.target 43 | return df 44 | 45 | 46 | # Select a subset of the columns to use as features. 47 | @builder 48 | def features_frame(raw_frame, feature_inclusion_regex): 49 | """Labeled data with a selected subset of the feature columns.""" 50 | included_feature_cols = [ 51 | col 52 | for col in raw_frame.columns.drop("target") 53 | if re.match(feature_inclusion_regex, col) 54 | ] 55 | return raw_frame[included_feature_cols + ["target"]] 56 | 57 | 58 | # Split the data into train and test sets. 59 | @builder 60 | # The `@outputs` decorator tells Bionic to define two new entities from this 61 | # function (which returns a tuple of two values). 
62 | @bn.outputs("train_frame", "test_frame") 63 | @bn.docs( 64 | "Subset of feature data rows, used for model training.", 65 | "Subset of feature data rows, used for model testing.", 66 | ) 67 | def split_raw_frame(features_frame, test_split_fraction, random_seed): 68 | return model_selection.train_test_split( 69 | features_frame, 70 | test_size=test_split_fraction, 71 | random_state=random_seed, 72 | ) 73 | 74 | 75 | # Fit a logistic regression model on the training data. 76 | @builder 77 | def model(train_frame, random_seed, hyperparams_dict): 78 | """A binary classifier sklearn model.""" 79 | m = linear_model.LogisticRegression( 80 | solver="liblinear", random_state=random_seed, **hyperparams_dict 81 | ) 82 | m.fit(train_frame.drop("target", axis=1), train_frame["target"]) 83 | return m 84 | 85 | 86 | # Predict probabilities for the test data. 87 | @builder 88 | def prediction_frame(model, test_frame): 89 | """ 90 | A dataframe with one column, `proba`, containing predicted probabilities for the 91 | test data. 92 | """ 93 | predictions = model.predict_proba(test_frame.drop("target", axis=1))[:, 1] 94 | df = pd.DataFrame() 95 | df["proba"] = predictions 96 | return df 97 | 98 | 99 | # Evaluate the model's precision and recall over a range of threshold values. 100 | @builder 101 | def precision_recall_frame(test_frame, prediction_frame): 102 | """ 103 | A dataframe with three columns: 104 | - `threshold`: a probability threshold for the model 105 | - `precision`: the test set precision resulting from that threshold 106 | - `recall`: the test set recall resulting from that threshold 107 | """ 108 | precisions, recalls, thresholds = metrics.precision_recall_curve( 109 | test_frame["target"], 110 | prediction_frame["proba"], 111 | ) 112 | 113 | df = pd.DataFrame() 114 | df["threshold"] = [0] + list(thresholds) + [1] 115 | df["precision"] = list(precisions) + [1] 116 | df["recall"] = list(recalls) + [0] 117 | 118 | return df 119 | 120 | 121 | # Plot the precision against the recall. 122 | @builder 123 | # The `@pyplot` decorator makes the Matplotlib plotting context available to 124 | # our function, then translates our plot into an image object. 125 | @bn.pyplot("plt") 126 | # The `@gather` decorator collects the values of of "hyperparams_dict" and 127 | # "precision_recall_frame" into a single dataframe named "gathered_frame". 128 | # This might not seem very interesting since "gathered_frame" only has one row, 129 | # but it will become useful once we introduce multiplicity. 130 | @bn.gather( 131 | over="hyperparams_dict", also="precision_recall_frame", into="gathered_frame" 132 | ) 133 | def all_hyperparams_pr_plot(gathered_frame, plt): 134 | """ 135 | A plot of precision against recall. Includes one curve for each set of 136 | hyperparameters. 137 | """ 138 | _, ax = plt.subplots(figsize=(4, 3)) 139 | for row in gathered_frame.itertuples(): 140 | label = ", ".join( 141 | f"{key}={value}" for key, value in row.hyperparams_dict.items() 142 | ) 143 | row.precision_recall_frame.plot(x="recall", y="precision", label=label, ax=ax) 144 | ax.set_xlabel("Recall") 145 | ax.set_ylabel("Precision") 146 | 147 | 148 | # Assemble our flow object. 
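# Once built, any entity can be computed on demand, for example (illustrative usage):
#
#     flow.get("model")                      # fits and returns the classifier
#     flow.get("all_hyperparams_pr_plot")    # returns the precision/recall plot image
#
# See example/ml_workflow_cli.py for a fuller command-line example.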
149 | flow = builder.build() 150 | -------------------------------------------------------------------------------- /docs/future.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | The Future of Bionic 3 | ==================== 4 | 5 | Development Status 6 | ------------------- 7 | 8 | Bionic is still at an early stage, and many features have been planned but not 9 | implemented. All of these features should be developed at some point, but the 10 | exact timeline is not fixed. 11 | 12 | Future Work 13 | ----------- 14 | 15 | Distributed Computation 16 | ....................... 17 | 18 | Currently Bionic computes everything on a single machine, using either a single 19 | process or many in parallel. Later it will be able to dispatch jobs to other machines 20 | (such as a cloud-based compute cluster) to achieve even more parallelization. 21 | 22 | Direct Access to Persisted Files 23 | ................................ 24 | 25 | Bionic is built around the idea that the user's code generally wants to operate 26 | on in-memory objects rather than files. However, in some cases it's preferable 27 | to operate on the raw files. For example, if a file is large we might want to 28 | load only small parts into memory at a time; or we might want to call an 29 | external script that only knows how to operate on files. In these cases it 30 | would be helpful to be able to do something like this: 31 | 32 | .. code-block:: python 33 | 34 | @builder 35 | @bionic.arg_as_file_path('raw_frame', 'raw_frame_path') 36 | def transformed_data(raw_frame_path): 37 | assert raw_frame_path.suffix == '.pq' 38 | subprocess.check_call(['transform_data.sh', str(raw_frame_path)]) 39 | 40 | Graph-Rewriting Decorators 41 | .......................... 42 | 43 | Normally Bionic translates each entity into a single node (or a parallel set of 44 | nodes) in its dependency graph. However, in some cases we might want to 45 | generate a more complex subgraph. For example, the author of an entity might 46 | know that its computation can be safely broken into chunks and run in parallel: 47 | 48 | .. code-block:: python 49 | 50 | @builder 51 | @bionic.parallelize_by_row('raw_frame') 52 | def filtered_data(raw_frame, relevant_categories): 53 | return raw_frame[raw_frame['category'].isin(relevant_categories)] 54 | 55 | 56 | User-Defined Decorators 57 | ....................... 58 | 59 | Bionic currently provides several built-in decorators, but their implementation 60 | is complex and tightly coupled with Bionic's internals. This is partly because 61 | we're still figuring out what Bionic's internal data model should look like. 62 | Once those internals are cleaner and more stable, it will be possible for users 63 | to write (and share) their own decorators. 64 | 65 | For example, Bionic provides a built-in :func:`@pyplot ` 66 | decorator to make Matplotlib plotting easier. We might want similar decorators 67 | for other external libraries that are awkward to use in the Bionic framework. 68 | 69 | Smarter Cache Invalidation 70 | .......................... 71 | 72 | Although Bionic attempts to automatically figure out when cached data can be 73 | used and when it needs to be recomputed, the user still needs to tell it about 74 | code changes using :func:`@version `. We have some experimental 75 | features (see :ref:`automatic-versioning`) to help with this, but they aren't
We believe we can improve their accuracy to the point where 77 | cache invalidation can be inferred automatically, without requiring the 78 | ``@version`` decorator at all. 79 | 80 | Automatic Regression Tests 81 | .......................... 82 | 83 | Following up on the concept of non-functional changes above: when a user 84 | performs a change that is supposed to be non-functional, they might actually 85 | want Bionic to verify this by re-running their code and confirming that the 86 | output is the same as the previous version's. 87 | 88 | Data Validation 89 | ............... 90 | 91 | Often we'd like to make assertions about an entity's output and be alerted if 92 | those assertions are violated. Currently this can be done in two ways: adding 93 | ``assert`` statements to the entity's function, or writing 94 | a custom `Protocol `_ with a special ``validate`` method. 95 | These solutions share two problems. First, they have to be written by the 96 | person who defines the entity; it's not possible to add new assertions about 97 | pre-existing entities. Second, if the assertions fail, the entity's value 98 | never gets persisted, so it's difficult to debug the problem -- especially if 99 | the value was expensive to compute. 100 | 101 | A better approach would be a first-class concept of an entity that validates 102 | other entities, after their value has been persisted but before it can be 103 | consumed by any other (non-validator) entities. 104 | 105 | Better Multiplicity Abstractions 106 | ................................ 107 | 108 | Bionic's concept of creating multiple values for an entity and then gathering 109 | them together is fairly novel (as far as we know), which means it will probably 110 | require some iteration before we find the best way to work with it. There are 111 | definitely many use cases of multiplicity that are awkward or impossible to 112 | express with the current API. For example, we might want one entity to be able 113 | to generate multiple downstream instances of another: for example, a 114 | ``hyperparameter_search_strategy`` entity which creates multiple instances of a 115 | ``hyperparameters_dict`` entity. 116 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_aip.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from textwrap import dedent 3 | 4 | import pytest 5 | 6 | import bionic as bn 7 | 8 | # This is detected by pytest and applied to all the tests in this module. 9 | from bionic.aip.docker_image_builder import fix_pip_requirements 10 | from bionic.aip.state import AipError 11 | 12 | pytestmark = pytest.mark.needs_aip 13 | 14 | 15 | def test_aip_jobs(aip_builder, log_checker): 16 | builder = aip_builder 17 | 18 | builder.assign("x1", 1) 19 | 20 | # Test various combinations of memoize and persist settings for these 21 | # function entities. 
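# x2 uses the default settings, x3 disables persistence, x4 disables
# memoization, and x5 disables both, so these entities cover all four
# combinations of the two settings.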
22 | 23 | @builder 24 | def x2(): 25 | return 2 26 | 27 | @builder 28 | @bn.persist(False) 29 | def x3(): 30 | return 3 31 | 32 | @builder 33 | @bn.memoize(False) 34 | def x4(): 35 | return 4 36 | 37 | @builder 38 | @bn.persist(False) 39 | @bn.memoize(False) 40 | def x5(): 41 | return 5 42 | 43 | @builder 44 | @bn.run_in_aip("n1-standard-4") 45 | def y1(x1, x2, x3, x4, x5): 46 | return x1 + x2 + x3 + x4 + x5 + 1 47 | 48 | @builder 49 | @bn.run_in_aip("n1-standard-8") 50 | def y2(x1, x2, x3, x4, x5): 51 | return x1 + x2 + x3 + x4 + x5 + 2 52 | 53 | @builder 54 | def y3(x1, x2, x3, x4, x5): 55 | return x1 + x2 + x3 + x4 + x5 + 3 56 | 57 | @builder 58 | def y4(x1, x2, x3, x4, x5): 59 | return x1 + x2 + x3 + x4 + x5 + 4 60 | 61 | @builder 62 | def y5(x1, x2, x3, x4, x5): 63 | return x1 + x2 + x3 + x4 + x5 + 5 64 | 65 | @builder 66 | def total(y1, y2, y3, y4, y5): 67 | return y1 + y2 + y3 + y4 + y5 68 | 69 | assert builder.build().get("y1") == 16 70 | 71 | log_checker.expect_regex( 72 | r"Staging AI Platform task .* at gs://.*bionic_y1.*", 73 | r"Started AI Platform task: https://console.cloud.google.com/ai-platform/jobs/.*bionic_y1.*", 74 | r"Submitting AI Platform task .*\(name='y1'\).*CaseKey\(x1=1\).*", 75 | r"Computed y1\(x1=1\) using AI Platform", 76 | r"Downloading y1\(x1=1\) from GCS \.\.\.", 77 | ) 78 | 79 | assert builder.build().get("total") == 90 80 | 81 | log_checker.expect_regex( 82 | r"Loaded y1\(x1=1\) from disk cache", 83 | r"Staging AI Platform task .* at gs://.*bionic_y2.*", 84 | r"Started AI Platform task: https://console.cloud.google.com/ai-platform/jobs/.*bionic_y2.*", 85 | r"Submitting AI Platform task .*\(name='y2'\).*CaseKey\(x1=1\).*", 86 | r"Computed y2\(x1=1\) using AI Platform", 87 | r"Downloading y2\(x1=1\) from GCS \.\.\.", 88 | r"Computed y3\(x1=1\)", 89 | r"Computed y4\(x1=1\)", 90 | r"Computed y5\(x1=1\)", 91 | r"Computed total\(x1=1\)", 92 | ) 93 | 94 | 95 | def test_aip_fail(aip_builder, log_checker): 96 | builder = aip_builder 97 | 98 | builder.assign("x", 1) 99 | 100 | @builder 101 | @bn.run_in_aip("n1-standard-4") 102 | def x_plus_one(x): 103 | raise Exception() 104 | 105 | with pytest.raises(AipError): 106 | builder.build().get("x_plus_one") 107 | 108 | log_checker.expect_regex( 109 | r"Staging AI Platform task .* at gs://.*bionic_x_plus_one.*", 110 | r"Started AI Platform task: https://console.cloud.google.com/ai-platform/jobs/.*bionic_x_plus_one.*", 111 | r"Submitting AI Platform task .*\(name='x_plus_one'\).*CaseKey\(x=1\).*", 112 | r".*error while doing remote computation for x_plus_one\(x=1\).*AipError.*", 113 | ) 114 | 115 | 116 | def test_fix_pip_requirements(): 117 | pip_requirements = dedent( 118 | """ 119 | Package1==1.2.3 120 | Package2==2 121 | -e git+git@github.com:square/bionic.git@f13f5405e928d92b553d2cbee41084eecccf7de3#egg=bionic 122 | -e git+https://github.com/square/bionic.git@88fec3d6921ed13b7c7575cca4c292b4f7003b9c#egg=bionic 123 | """ 124 | ) 125 | 126 | fixed_pip_requirements = dedent( 127 | """ 128 | Package1==1.2.3 129 | Package2==2 130 | -e git+https://github.com/square/bionic.git@f13f5405e928d92b553d2cbee41084eecccf7de3#egg=bionic 131 | -e git+https://github.com/square/bionic.git@88fec3d6921ed13b7c7575cca4c292b4f7003b9c#egg=bionic 132 | """ 133 | ) 134 | 135 | assert fix_pip_requirements(pip_requirements) == fixed_pip_requirements 136 | 137 | 138 | @pytest.mark.needs_aip_and_docker_commit_access 139 | @pytest.mark.real_gcp_only 140 | @pytest.mark.no_parallel 141 | def test_aip_with_docker_build(aip_builder): 142 | builder = 
aip_builder 143 | builder.set("core__aip_execution__docker_image_name", None) 144 | 145 | def get_pip_freeze_exclude_editable() -> str: 146 | # pip freeze may not work properly for editable installs when running in 147 | # AIP since AIP does not have access to remote git repositories. Hence 148 | # editable installs are excluded. 149 | return subprocess.run( 150 | ["pip", "freeze", "--exclude-editable"], 151 | capture_output=True, 152 | check=True, 153 | encoding="utf-8", 154 | ).stdout 155 | 156 | @builder 157 | @bn.run_in_aip("n1-standard-4") 158 | def x(): 159 | return get_pip_freeze_exclude_editable() 160 | 161 | flow = builder.build() 162 | 163 | assert flow.get("x") == get_pip_freeze_exclude_editable() 164 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from git import Repo 5 | 6 | from bionic.utils.misc import oneline 7 | 8 | 9 | @pytest.fixture(autouse=True) 10 | def set_env_variables(monkeypatch): 11 | # We don't want to set up Stackdriver logging for local tests. 12 | monkeypatch.setenv("BIONIC_NO_STACKDRIVER", "True") 13 | yield 14 | monkeypatch.delenv("BIONIC_NO_STACKDRIVER") 15 | 16 | 17 | def pytest_addoption(parser): 18 | parser.addoption( 19 | "--slow", action="store_true", default=False, help="run slow tests" 20 | ) 21 | parser.addoption( 22 | "--bucket", action="store", help="URL to GCS bucket to use for tests" 23 | ) 24 | parser.addoption( 25 | "--aip", 26 | action="store_true", 27 | default=False, 28 | help="run AIP tests, requires --bucket", 29 | ) 30 | parser.addoption( 31 | "--parallel", 32 | action="store_true", 33 | default=False, 34 | help="also run all tests with parallel execution mode", 35 | ) 36 | 37 | 38 | def pytest_configure(config): 39 | def add_mark(name, description): 40 | config.addinivalue_line("markers", f"{name}: given test {description}") 41 | 42 | # These markers are added manually. 43 | add_mark("slow", "runs slowly") 44 | add_mark("needs_gcs", "requires GCS to run") 45 | add_mark("needs_aip", "requires AIP execution to run") 46 | add_mark("needs_parallel", "requires parallel execution to run") 47 | add_mark("no_parallel", "does not run with parallel execution") 48 | add_mark( 49 | "allows_parallel", 50 | "can run with parallel execution even when that's not explicitly enabled", 51 | ) 52 | add_mark("real_gcp_only", "runs on real GCP only") 53 | add_mark("fake_gcp_only", "runs on fake GCP only") 54 | add_mark( 55 | "needs_aip_and_docker_commit_access", 56 | "requires AIP and docker access to the current git commit", 57 | ) 58 | 59 | # These markers are added automatically based on parametric fixtures. 60 | add_mark("serial", "will run using serial execution") 61 | add_mark("parallel", "will run using parallel execution") 62 | add_mark("real_gcp", "use real gcp") 63 | add_mark("fake_gcp", "use fake gcp") 64 | 65 | # This marker is added automatically based on other markers. 
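# Specifically, pytest_collection_modifyitems below marks an item as "baseline"
# only if it needs none of --slow, --bucket, --aip, or --parallel.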
66 | add_mark("baseline", "runs by default when no options are passed to pytest") 67 | 68 | 69 | def is_current_commit_remotely_available(): 70 | repo = Repo(os.getcwd(), search_parent_directories=True) 71 | return ( 72 | not repo.is_dirty() 73 | and len(repo.git.branch("-r", "--contains", repo.head.ref.object.hexsha)) > 0 74 | ) 75 | 76 | 77 | def pytest_collection_modifyitems(config, items): 78 | also_run_slow = config.getoption("--slow") 79 | skip_slow = pytest.mark.skip(reason="only runs when --slow is set") 80 | 81 | has_gcs = config.getoption("--bucket") 82 | skip_needs_gcs = pytest.mark.skip(reason="only runs when --bucket is set") 83 | 84 | has_aip = has_gcs and config.getoption("--aip") 85 | skip_needs_aip = pytest.mark.skip( 86 | reason="only runs when both --bucket and --aip are set" 87 | ) 88 | 89 | also_run_parallel = config.getoption("--parallel") 90 | 91 | items_to_keep = [] 92 | for item in items: 93 | item_is_baseline = True 94 | 95 | if "slow" in item.keywords: 96 | item_is_baseline = False 97 | if not also_run_slow: 98 | item.add_marker(skip_slow) 99 | 100 | if "real_gcp" in item.keywords: 101 | if "fake_gcp_only" in item.keywords: 102 | continue 103 | 104 | if "needs_gcs" in item.keywords: 105 | item_is_baseline = False 106 | if not has_gcs: 107 | item.add_marker(skip_needs_gcs) 108 | 109 | if "needs_aip" in item.keywords: 110 | item_is_baseline = False 111 | if not has_aip: 112 | item.add_marker(skip_needs_aip) 113 | 114 | elif "fake_gcp" in item.keywords: 115 | if "real_gcp_only" in item.keywords: 116 | continue 117 | 118 | if "parallel" in item.keywords: 119 | if "allows_parallel" not in item.keywords: 120 | item_is_baseline = False 121 | 122 | if "no_parallel" in item.keywords or not also_run_parallel: 123 | continue 124 | 125 | elif "needs_parallel" in item.keywords: 126 | continue 127 | 128 | if "needs_aip_and_docker_commit_access" in item.keywords: 129 | if not has_aip: 130 | item.add_marker(skip_needs_aip) 131 | elif not is_current_commit_remotely_available(): 132 | item.add_marker( 133 | pytest.mark.skip( 134 | reason=oneline( 135 | """ 136 | only runs when --bucket and --aip are set and the 137 | current git commit is available for access by docker 138 | build; that means the commit is pushed to the remote 139 | repository 140 | """ 141 | ) 142 | ) 143 | ) 144 | 145 | if item_is_baseline: 146 | item.add_marker(pytest.mark.baseline) 147 | 148 | items_to_keep.append(item) 149 | 150 | items.clear() 151 | items.extend(items_to_keep) 152 | -------------------------------------------------------------------------------- /tests/test_utils/test_keyed_priority_stack.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from random import Random 4 | 5 | from bionic.utils.keyed_priority_stack import KeyedPriorityStack 6 | 7 | 8 | def test_simple_push(): 9 | kps = KeyedPriorityStack() 10 | 11 | assert len(kps) == 0 12 | 13 | kps.push("ONE", "1", 1) 14 | kps.push("TWO_A", "2a", 2) 15 | kps.push("THREE", "3", 3) 16 | kps.push("TWO_B", "2b", 2) 17 | 18 | assert len(kps) == 4 19 | 20 | assert kps.pop() == "3" 21 | assert kps.pop() == "2b" 22 | assert kps.pop() == "2a" 23 | assert kps.pop() == "1" 24 | 25 | assert len(kps) == 0 26 | 27 | 28 | def test_pop_by_key(): 29 | kps = KeyedPriorityStack() 30 | 31 | with pytest.raises(KeyError): 32 | kps.pop("ONE") 33 | 34 | kps.push("ONE", "1", 1) 35 | kps.push("TWO_A", "2a", 2) 36 | kps.push("THREE", "3", 3) 37 | kps.push("TWO_B", "2b", 2) 38 | 39 | with pytest.raises(KeyError): 
40 | kps.pop("1") 41 | 42 | assert kps.pop("TWO_B") == "2b" 43 | assert kps.pop() == "3" 44 | assert kps.pop("TWO_A") == "2a" 45 | assert kps.pop() == "1" 46 | 47 | with pytest.raises(KeyError): 48 | kps.pop("THREE") 49 | 50 | 51 | def test_incomparable_unhashable_values(): 52 | class Wrapper: 53 | def __init__(self, value): 54 | self.value = value 55 | 56 | def __eq__(self, other): 57 | raise NotImplementedError("!") 58 | 59 | def __hash__(self, other): 60 | raise NotImplementedError("!") 61 | 62 | kps = KeyedPriorityStack() 63 | 64 | kps.push("ONE", Wrapper("1"), 1) 65 | kps.push("TWO_A", Wrapper("2a"), 2) 66 | kps.push("THREE", Wrapper("3"), 3) 67 | kps.push("TWO_B", Wrapper("2b"), 2) 68 | 69 | assert kps.pop().value == "3" 70 | assert kps.pop().value == "2b" 71 | assert kps.pop().value == "2a" 72 | assert kps.pop().value == "1" 73 | 74 | 75 | def test_random(): 76 | """ 77 | Tests our data structure by applying a series of random operations and comparing 78 | the results to an oracle (SimpleKeyedPriorityStack). 79 | """ 80 | 81 | random = Random(0) 82 | MAX_VALUE = 1000000 83 | 84 | test_kps = KeyedPriorityStack() 85 | ctrl_kps = SimpleKeyedPriorityStack() 86 | 87 | def do_push(): 88 | value = random.randrange(MAX_VALUE) 89 | priority = random.randrange(MAX_VALUE) 90 | key = random.randrange(MAX_VALUE) 91 | 92 | test_kps.push(key, value, priority) 93 | ctrl_kps.push(key, value, priority) 94 | 95 | def do_and_check_pop(): 96 | if len(test_kps) == 0: 97 | with pytest.raises(IndexError): 98 | test_kps.pop() 99 | return 100 | 101 | assert test_kps.pop() == ctrl_kps.pop() 102 | 103 | def do_and_check_pop_with_key(): 104 | key = ctrl_kps._get_random_key(random) 105 | if key is None: 106 | return 107 | 108 | assert test_kps.pop(key) == ctrl_kps.pop(key) 109 | 110 | def check_pop_missing_key(): 111 | key = random.randrange(MAX_VALUE) + MAX_VALUE 112 | 113 | with pytest.raises(KeyError): 114 | test_kps.pop(key) 115 | 116 | def check_push(): 117 | key = ctrl_kps._get_random_key(random) 118 | if key is None: 119 | return 120 | value = random.randrange(MAX_VALUE) 121 | priority = random.randrange(MAX_VALUE) 122 | 123 | with pytest.raises(ValueError): 124 | test_kps.push(key, value, priority) 125 | 126 | def check_len(): 127 | assert len(test_kps) == len(ctrl_kps) 128 | 129 | N_ITERS = 3000 130 | ACTIONS = [ 131 | # We have more pushes than pops, so the size of the stack should tend to grow 132 | # over time. 133 | do_push, 134 | do_push, 135 | do_push, 136 | do_and_check_pop, 137 | do_and_check_pop_with_key, 138 | check_len, 139 | check_push, 140 | check_pop_missing_key, 141 | ] 142 | for i in range(N_ITERS): 143 | action = random.choice(ACTIONS) 144 | action() 145 | while len(test_kps) > 0: 146 | do_and_check_pop() 147 | check_len() 148 | 149 | 150 | class SimpleKeyedPriorityStack: 151 | """ 152 | An alternative implementation of KeyedPriorityStack which is simpler but less 153 | efficient. 
154 | """ 155 | 156 | def __init__(self): 157 | self._sorted_quads = [] 158 | self._next_seq_id = 0 159 | 160 | def push(self, key, value, priority): 161 | seq_id = self._next_seq_id 162 | self._next_seq_id += 1 163 | 164 | self._sorted_quads.append([priority, seq_id, value, key]) 165 | self._sorted_quads.sort() 166 | 167 | def pop(self, key=None): 168 | if key is not None: 169 | ix = self._quad_ix_for_key(key) 170 | _, _, value, _ = self._sorted_quads.pop(ix) 171 | return value 172 | 173 | else: 174 | _, _, value, _ = self._sorted_quads.pop() 175 | return value 176 | 177 | def __len__(self): 178 | return len(self._sorted_quads) 179 | 180 | def _quad_ix_for_key(self, key): 181 | (quad_ix,) = [ 182 | quad_ix 183 | for (quad_ix, (_, _, _, quad_key)) in enumerate(self._sorted_quads) 184 | if quad_key == key 185 | ] 186 | return quad_ix 187 | 188 | def _get_random_key(self, random): 189 | if len(self) == 0: 190 | return None 191 | return random.choice(self._sorted_quads)[3] 192 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Contributing to Bionic 3 | ====================== 4 | 5 | Bionic's source is maintained on `GitHub `_. 6 | You can clone it with: 7 | 8 | .. code-block:: bash 9 | 10 | git clone git@github.com:square/bionic.git 11 | 12 | Pull requests are welcome! (However, for large changes, we recommend 13 | discussing the proposed change on our `Issues page 14 | `_ first.) Because Bionic is 15 | supported by Square, all new contributors will be asked to sign `Square's 16 | Contributor License Agreement 17 | `_ as part 18 | of the pull request process. 19 | 20 | For Bionic core developers, our internal processes are documented :doc:`here 21 | `. 22 | 23 | Submitting a Pull Request 24 | ------------------------- 25 | 26 | To maintain a baseline level of correctness, readability, and design 27 | coherence, every pull request to Bionic is reviewed by a maintainer. 28 | Maintainers typically check at least the following: 29 | 30 | 1. If you're making changes to Bionic's behavior, include tests if possible, 31 | and add an entry to the `Release Notes `_. 32 | 2. If you're updating Bionic's user-facing API, :ref:`update the 33 | documentation `. 34 | 3. Make sure all existing :ref:`tests ` and :ref:`style checks 35 |