├── tests ├── __init__.py ├── test_flow │ ├── __init__.py │ ├── test_persistence_compatibility │ │ ├── artifacts │ │ │ ├── total_sum │ │ │ │ ├── 2f000e88-5a8f-4762-b7c4-77eb444348f6 │ │ │ │ │ └── total_sum.json │ │ │ │ └── faed6d2b-5b8c-449a-9fd6-946bec4f5b0d │ │ │ │ │ └── total_sum.json │ │ │ ├── lowercase_sum │ │ │ │ ├── 1c22c085-6fa5-4df7-a69a-a03a3e880e90 │ │ │ │ │ └── lowercase_sum.json │ │ │ │ └── 44556b2a-bd8f-44c6-a1bb-1a03ed2a839f │ │ │ │ │ └── lowercase_sum.json │ │ │ ├── uppercase_sum │ │ │ │ ├── 5998ef92-4102-4e9c-9ef5-f996da3a9fd9 │ │ │ │ │ └── uppercase_sum.json │ │ │ │ └── fbdc03e4-c713-4a7f-aca6-79bd31bb9d62 │ │ │ │ │ └── uppercase_sum.json │ │ │ ├── lowercase_chars │ │ │ │ └── cfe1e872-5b26-4733-9859-4d323d667ae5 │ │ │ │ │ └── lowercase_chars.setpkl │ │ │ │ │ ├── type.pkl │ │ │ │ │ └── items │ │ │ │ │ ├── 007b2b8ca6c265851d06cf3ba2ffcb4d6acc7e23883fbbde5b73afad3444260f_0.pkl │ │ │ │ │ ├── 0b5417898974f490fdf4a442f711925284da871660232c34ba2f8d98cae479dc_0.pkl │ │ │ │ │ ├── 730c53e7abe3c1fa5ec658e2c1139bf73026d56b3b933cf34c7b663d905b28bf_0.pkl │ │ │ │ │ ├── 897f2e2b559dd876ad870c82283197b8cfecdf84736192ea6fb9ee5a5080a3a4_0.pkl │ │ │ │ │ ├── bb2940ae26249720daf30d8464d1002c8c09d8f87688aab9cfbbddcdaf22f79f_0.pkl │ │ │ │ │ └── e4e3cdb83096746758d4f418c1c11d93ffdfbab5a4eebffef734e4396c2ce181_0.pkl │ │ │ └── uppercase_chars │ │ │ │ └── 185898d4-eaeb-46dc-85db-498018b29756 │ │ │ │ └── uppercase_chars.setpkl │ │ │ │ ├── type.pkl │ │ │ │ └── items │ │ │ │ ├── 0e359834dbf9b14f902538ac42ef4ce523a7f665f04a0a985c5e7fe83df360a3_0.pkl │ │ │ │ ├── 21c27bb5c58f87daff8b16ac6dcd17b62345515033e5d8fa66fd44bcfb357780_0.pkl │ │ │ │ ├── 2432d9437cf69add843d4b37526aafb6e28b4edbd3b65a13bec0c99b4628304b_0.pkl │ │ │ │ ├── 3523c5c4504ff1e243867443a194deac2b64c05fd43f6eee5b4c172fcfd5f5bf_0.pkl │ │ │ │ ├── b511f210249bc8eb40056e3fc2383161ca20585396904cc84f77c31f289be4aa_0.pkl │ │ │ │ └── df62494217bc7fffc20d07542eeb8e269a35b6616fd80d26ada10561af513314_0.pkl │ │ └── inventory │ │ │ ├── lowercase_chars │ │ │ └── c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19 │ │ │ │ └── d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4 │ │ │ │ └── metadata_0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f.yaml │ │ │ ├── uppercase_chars │ │ │ └── 9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45 │ │ │ │ └── 12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975 │ │ │ │ └── metadata_2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0.yaml │ │ │ ├── lowercase_sum │ │ │ ├── 18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365 │ │ │ │ └── abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7 │ │ │ │ │ └── metadata_d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb.yaml │ │ │ └── df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae │ │ │ │ └── 928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f │ │ │ │ └── metadata_68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633.yaml │ │ │ ├── uppercase_sum │ │ │ ├── 4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3 │ │ │ │ └── 2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91 │ │ │ │ │ └── metadata_dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5.yaml │ │ │ └── 77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8 │ │ │ │ └── 25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542 │ │ │ │ └── 
metadata_154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035.yaml │ │ │ └── total_sum │ │ │ ├── 401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0 │ │ │ └── f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b │ │ │ │ └── metadata_3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34.yaml │ │ │ └── e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98 │ │ │ └── e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c │ │ │ └── metadata_354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d.yaml │ ├── test_execution.py │ ├── test_relative_cache_path.py │ ├── test_plotting.py │ ├── test_persistence_compatibility.py │ ├── test_join.py │ ├── test_interactions.py │ ├── test_multi_out.py │ ├── generate_test_compatibility_cache.py │ ├── test_new_api.py │ ├── test_logging.py │ ├── test_outputs.py │ ├── test_copy.py │ ├── test_dagviz.py │ ├── test_persistence_aip.py │ └── test_executor.py ├── test_utils │ ├── __init__.py │ ├── test_urls.py │ ├── test_keyed_priority_stack.py │ └── test_misc.py ├── test_optdep.py ├── test_tokenize.py ├── test_helpers.py └── conftest.py ├── bionic ├── aip │ ├── __init__.py │ ├── client.py │ ├── state.py │ ├── main.py │ ├── docker_image_builder.py │ └── task.py ├── flake8 │ └── __init__.py ├── descriptors │ └── __init__.py ├── core │ └── __init__.py ├── utils │ ├── __init__.py │ ├── gcp_auth.py │ ├── files.py │ ├── reload.py │ ├── urls.py │ └── keyed_priority_stack.py ├── deps │ ├── __init__.py │ ├── extras.py │ └── optdep.py ├── util.py ├── __init__.py ├── interpret.py ├── exception.py ├── filecopier.py ├── tokenization.py ├── gcs.py ├── decoration.py └── protocol.py ├── example ├── __init__.py ├── basic_workflow.py ├── hello_world.py ├── intro_workflow.py ├── ml_workflow_cli.py └── ml_workflow.py ├── .dockerignore ├── MANIFEST.in ├── docs ├── tutorials │ ├── _tutorial_setup.py │ └── hello_world.ipynb ├── api │ ├── index.rst │ ├── util.rst │ ├── flow.rst │ ├── decorators.rst │ └── protocols.rst ├── get-help.rst ├── Makefile ├── maintaining.rst ├── index.rst ├── get-started.rst ├── future.rst ├── contributing.rst ├── warnings.rst └── conf.py ├── .bumpversion.cfg ├── .readthedocs.yml ├── .github ├── CODEOWNERS └── workflows │ ├── publish.yml │ └── bionic-test.yml ├── Dockerfile ├── setup.cfg ├── .pre-commit-config.yaml ├── .gitignore ├── README.md └── setup.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bionic/aip/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bionic/flake8/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_flow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/bionic/descriptors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Bionic cache files 2 | bndata 3 | -------------------------------------------------------------------------------- /bionic/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains Bionic core logic to execute tasks and their dependencies. 3 | """ 4 | -------------------------------------------------------------------------------- /bionic/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains reusable utility functions that don't have any Bionic-specific logic. 3 | """ 4 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/total_sum/2f000e88-5a8f-4762-b7c4-77eb444348f6/total_sum.json: -------------------------------------------------------------------------------- 1 | 1002 -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/total_sum/faed6d2b-5b8c-449a-9fd6-946bec4f5b0d/total_sum.json: -------------------------------------------------------------------------------- 1 | 1002 -------------------------------------------------------------------------------- /bionic/deps/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains modules for defining and importing Bionic's optional 3 | dependency packages. 
4 | """ 5 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_sum/1c22c085-6fa5-4df7-a69a-a03a3e880e90/lowercase_sum.json: -------------------------------------------------------------------------------- 1 | 597 -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_sum/44556b2a-bd8f-44c6-a1bb-1a03ed2a839f/lowercase_sum.json: -------------------------------------------------------------------------------- 1 | 597 -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_sum/5998ef92-4102-4e9c-9ef5-f996da3a9fd9/uppercase_sum.json: -------------------------------------------------------------------------------- 1 | 405 -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_sum/fbdc03e4-c713-4a7f-aca6-79bd31bb9d62/uppercase_sum.json: -------------------------------------------------------------------------------- 1 | 405 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | graft bionic 4 | graft tests 5 | graft example 6 | global-exclude __pycache__ *.py[co] *.sw[po] 7 | -------------------------------------------------------------------------------- /docs/tutorials/_tutorial_setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | project_path = str(Path("../..").resolve()) 5 | if project_path not in sys.path: 6 | sys.path.insert(0, project_path) 7 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | API Reference 3 | ============= 4 | 5 | These are the APIs provided by Bionic. 6 | 7 | .. toctree:: 8 | 9 | flow 10 | decorators 11 | protocols 12 | util 13 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.11.1 3 | commit = True 4 | tag = False 5 | 6 | [bumpversion:file:setup.py] 7 | 8 | [bumpversion:file:docs/conf.py] 9 | 10 | [bumpversion:file:bionic/__init__.py] 11 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: [] 3 | python: 4 | version: 3.7 5 | install: 6 | - method: pip 7 | path: . 
8 | extra_requirements: 9 | - dev 10 | system_packages: true 11 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/type.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/type.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/type.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/type.pkl -------------------------------------------------------------------------------- /bionic/utils/gcp_auth.py: -------------------------------------------------------------------------------- 1 | from bionic.deps.optdep import import_optional_dependency 2 | 3 | 4 | def get_gcp_project_id(): 5 | google_auth = import_optional_dependency( 6 | "google.auth", purpose="Get GCP project id from the environment" 7 | ) 8 | _, project = google_auth.default() 9 | return project 10 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Lines starting with '#' are comments. 2 | # Each line is a file pattern followed by one or more owners. 3 | # Check out the link below for more information. 4 | # https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners 5 | 6 | # These owners will be the default owners for everything in the repo. 7 | * @jqmp @namanjain @simonafk 8 | -------------------------------------------------------------------------------- /bionic/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module is deprecated and exists only for backwards compatibility. 3 | 4 | Some older documentation recommended using `bionic.util.init_basic_logging` to expose 5 | Bionic's logs. This function is now located at `bionic.utils.misc.init_basic_logging`. 6 | Eventually we should remove the need for this function and deprecate it there too. 
7 | """ 8 | 9 | from .utils.misc import init_basic_logging # noqa: F401 10 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/007b2b8ca6c265851d06cf3ba2ffcb4d6acc7e23883fbbde5b73afad3444260f_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/007b2b8ca6c265851d06cf3ba2ffcb4d6acc7e23883fbbde5b73afad3444260f_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/0b5417898974f490fdf4a442f711925284da871660232c34ba2f8d98cae479dc_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/0b5417898974f490fdf4a442f711925284da871660232c34ba2f8d98cae479dc_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/730c53e7abe3c1fa5ec658e2c1139bf73026d56b3b933cf34c7b663d905b28bf_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/730c53e7abe3c1fa5ec658e2c1139bf73026d56b3b933cf34c7b663d905b28bf_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/897f2e2b559dd876ad870c82283197b8cfecdf84736192ea6fb9ee5a5080a3a4_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/897f2e2b559dd876ad870c82283197b8cfecdf84736192ea6fb9ee5a5080a3a4_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/bb2940ae26249720daf30d8464d1002c8c09d8f87688aab9cfbbddcdaf22f79f_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/bb2940ae26249720daf30d8464d1002c8c09d8f87688aab9cfbbddcdaf22f79f_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/e4e3cdb83096746758d4f418c1c11d93ffdfbab5a4eebffef734e4396c2ce181_0.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl/items/e4e3cdb83096746758d4f418c1c11d93ffdfbab5a4eebffef734e4396c2ce181_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/0e359834dbf9b14f902538ac42ef4ce523a7f665f04a0a985c5e7fe83df360a3_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/0e359834dbf9b14f902538ac42ef4ce523a7f665f04a0a985c5e7fe83df360a3_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/21c27bb5c58f87daff8b16ac6dcd17b62345515033e5d8fa66fd44bcfb357780_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/21c27bb5c58f87daff8b16ac6dcd17b62345515033e5d8fa66fd44bcfb357780_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/2432d9437cf69add843d4b37526aafb6e28b4edbd3b65a13bec0c99b4628304b_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/2432d9437cf69add843d4b37526aafb6e28b4edbd3b65a13bec0c99b4628304b_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/3523c5c4504ff1e243867443a194deac2b64c05fd43f6eee5b4c172fcfd5f5bf_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/3523c5c4504ff1e243867443a194deac2b64c05fd43f6eee5b4c172fcfd5f5bf_0.pkl -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/b511f210249bc8eb40056e3fc2383161ca20585396904cc84f77c31f289be4aa_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/b511f210249bc8eb40056e3fc2383161ca20585396904cc84f77c31f289be4aa_0.pkl 
-------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/df62494217bc7fffc20d07542eeb8e269a35b6616fd80d26ada10561af513314_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/square/bionic/HEAD/tests/test_flow/test_persistence_compatibility/artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl/items/df62494217bc7fffc20d07542eeb8e269a35b6616fd80d26ada10561af513314_0.pkl -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # An example docker image that can be used to test the AIP integration 2 | FROM python:3.8 3 | 4 | WORKDIR /code 5 | 6 | COPY README.md setup.py /code/ 7 | COPY bionic/deps/ /code/bionic/deps 8 | RUN ls /code/* 9 | 10 | 11 | RUN python setup.py egg_info && \ 12 | sed '/^\[/d' bionic.egg-info/requires.txt | sort | uniq >> requirements.txt && \ 13 | pip install -r requirements.txt 14 | 15 | COPY . ./ 16 | 17 | RUN pip install -e . 18 | -------------------------------------------------------------------------------- /tests/test_flow/test_execution.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import os 4 | 5 | 6 | @pytest.mark.allows_parallel 7 | def test_execution_mode(builder, parallel_execution_enabled): 8 | @builder 9 | def pid(): 10 | return os.getpid() 11 | 12 | current_pid = os.getpid() 13 | returned_pid = builder.build().get("pid") 14 | 15 | if parallel_execution_enabled: 16 | assert current_pid != returned_pid 17 | else: 18 | assert current_pid == returned_pid 19 | -------------------------------------------------------------------------------- /bionic/__init__.py: -------------------------------------------------------------------------------- 1 | from .flow import Flow, FlowBuilder # noqa: F401 2 | from .decorators import ( # noqa: F401 3 | version, 4 | version_no_warnings, 5 | output, 6 | outputs, 7 | docs, 8 | gather, 9 | persist, 10 | memoize, 11 | pyplot, 12 | immediate, 13 | changes_per_run, 14 | accepts, 15 | returns, 16 | run_in_aip, 17 | ) 18 | 19 | from . import protocol # noqa: F401 20 | from . import util # noqa: F401 21 | 22 | __version__ = "0.11.1" 23 | -------------------------------------------------------------------------------- /docs/get-help.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Get Help 3 | ======== 4 | 5 | For help using Bionic, please post your question on `Stack Overflow 6 | `_. Until Bionic has its own `tag 7 | `_, it's good to use all three of the following 8 | words in your question so we can find it easily: "bionic", "framework", and 9 | "python". 10 | 11 | For bug reports and feature requests: please use our `GitHub Issue 12 | Tracker `_.
13 | -------------------------------------------------------------------------------- /example/basic_workflow.py: -------------------------------------------------------------------------------- 1 | import bionic as bn 2 | 3 | builder = bn.FlowBuilder("basic_workflow") 4 | 5 | builder.assign("x", values=[2, 3]) 6 | builder.assign("y", values=[5, 7]) 7 | 8 | 9 | @builder 10 | def x_plus_y(x, y): 11 | return x + y 12 | 13 | 14 | flow = builder.build() 15 | 16 | if __name__ == "__main__": 17 | bn.utils.misc.init_basic_logging() 18 | 19 | for _, row in flow.get("x_plus_y", "series").reset_index().iterrows(): 20 | print(f"{row['x']} + {row['y']} = {row['x_plus_y']}") 21 | -------------------------------------------------------------------------------- /docs/api/util.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Utilities 3 | ==================== 4 | 5 | FileCopier 6 | ------------ 7 | When called with the ``mode='FileCopier'`` argument, 8 | :meth:`Flow.get ` can return a 9 | :class:`FileCopier ` instance. This is simply a 10 | utility class that exposes a 11 | :meth:`copy ` method, enabling the 12 | user to copy files around without knowing any internal details about where 13 | Bionic stores them. 14 | 15 | FileCopier API 16 | --------------- 17 | 18 | .. autoclass:: bionic.filecopier.FileCopier 19 | :members: -------------------------------------------------------------------------------- /bionic/aip/client.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bionic.deps.optdep import import_optional_dependency 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | _cached_aip_client = None 8 | 9 | 10 | def get_aip_client(cache_value=True): 11 | if cache_value: 12 | global _cached_aip_client 13 | if _cached_aip_client is None: 14 | _cached_aip_client = get_aip_client(cache_value=False) 15 | return _cached_aip_client 16 | 17 | discovery = import_optional_dependency( 18 | "googleapiclient.discovery", raise_on_missing=True 19 | ) 20 | logger.info("Initializing AIP client ...") 21 | return discovery.build("ml", "v1", cache_discovery=False) 22 | -------------------------------------------------------------------------------- /tests/test_flow/test_relative_cache_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def test_move_cache_files(builder, tmp_path): 6 | builder.assign("x", 2) 7 | builder.assign("y", 3) 8 | 9 | @builder 10 | def xy(x, y): 11 | return x * y 12 | 13 | cur_dir = os.path.join(tmp_path, "current") 14 | new_dir = os.path.join(tmp_path, "new") 15 | 16 | builder.set("core__persistent_cache__flow_dir", cur_dir) 17 | flow = builder.build() 18 | # call a method to create cache 19 | assert flow.get("xy") == 6 20 | 21 | shutil.copytree(cur_dir, new_dir) 22 | 23 | builder.set("core__persistent_cache__flow_dir", new_dir) 24 | flow = builder.build() 25 | assert flow.get("xy") == 6 26 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.7' 18 | - name: Install 
dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools wheel twine 22 | - name: Build and publish 23 | env: 24 | TWINE_USERNAME: __token__ 25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 26 | run: | 27 | python setup.py sdist bdist_wheel --universal 28 | twine upload --verbose dist/* 29 | -------------------------------------------------------------------------------- /tests/test_optdep.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bionic.deps.optdep import ( 4 | import_optional_dependency, 5 | TEST_EXTRA_NAME, 6 | TEST_PACKAGE_NAME, 7 | ) 8 | 9 | 10 | def test_import_missing_dependency(): 11 | with pytest.raises( 12 | ImportError, 13 | match=".*%s.*PURPOSE.*pip install 'bionic\\[%s\\]'.*" 14 | % (TEST_PACKAGE_NAME, TEST_EXTRA_NAME), 15 | ): 16 | import_optional_dependency(TEST_PACKAGE_NAME, purpose="PURPOSE") 17 | 18 | 19 | def test_import_missing_dependency_without_raising(): 20 | module = import_optional_dependency(TEST_PACKAGE_NAME, raise_on_missing=False) 21 | assert module is None 22 | 23 | 24 | def test_import_unrecognized_dependency(): 25 | with pytest.raises(AssertionError): 26 | import_optional_dependency("_UNKNOWN_PACKAGE_", purpose="PURPOSE") 27 | -------------------------------------------------------------------------------- /bionic/aip/state.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | 4 | class AipError(Exception): 5 | pass 6 | 7 | 8 | class State(Enum): 9 | STATE_UNSPECIFIED = auto() 10 | QUEUED = auto() 11 | PREPARING = auto() 12 | RUNNING = auto() 13 | SUCCEEDED = auto() 14 | FAILED = auto() 15 | CANCELLING = auto() 16 | CANCELLED = auto() 17 | 18 | def is_executing(self): 19 | return self in { 20 | State.STATE_UNSPECIFIED, 21 | State.QUEUED, 22 | State.PREPARING, 23 | State.RUNNING, 24 | } 25 | 26 | def is_cancelled(self): 27 | return self in {State.CANCELLING, State.CANCELLED} 28 | 29 | def is_finished(self): 30 | return self in { 31 | State.SUCCEEDED, 32 | State.FAILED, 33 | State.CANCELLING, 34 | State.CANCELLED, 35 | } 36 | -------------------------------------------------------------------------------- /bionic/utils/files.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with files. 3 | """ 4 | 5 | import shutil 6 | 7 | 8 | def ensure_parent_dir_exists(path): 9 | ensure_dir_exists(path.parent) 10 | 11 | 12 | def ensure_dir_exists(path): 13 | path.mkdir(parents=True, exist_ok=True) 14 | 15 | 16 | def recursively_copy_path(src_path, dst_path): 17 | if not src_path.exists(): 18 | raise ValueError(f"Path does not exist: {src_path}") 19 | ensure_parent_dir_exists(dst_path) 20 | 21 | if src_path.is_file(): 22 | shutil.copyfile(str(src_path), str(dst_path)) 23 | else: 24 | shutil.copytree(str(src_path), str(dst_path)) 25 | 26 | 27 | def recursively_delete_path(path): 28 | if not path.exists(): 29 | raise ValueError(f"Path does not exist: {path}") 30 | 31 | if path.is_file(): 32 | path.unlink() 33 | else: 34 | shutil.rmtree(path) 35 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # You can set these variables from the command line, and also from the environment for 2 | # the first two. 3 | SPHINXOPTS ?= 4 | SPHINXBUILD ?= sphinx-build 5 | SOURCEDIR = . 
6 | BUILDDIR = _build 7 | 8 | # Put this first so that "make" without argument is like "make help". 9 | help: 10 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 11 | 12 | open: html 13 | open _build/html/index.html 14 | 15 | livehtml: 16 | sphinx-autobuild --ignore '*.swp' --ignore 'tutorials/bndata/**/*' --ignore 'tutorials/.ipynb_checkpoints/**/*' -b html $(ALLSPHINXOPTS) . $(BUILDDIR)/html 17 | 18 | .PHONY: help Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /bionic/interpret.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convenience functions for handling arguments based on their type. These can be 3 | used to provide "Pandas-like" APIs that accept (e.g.) either a string or a list 4 | of strings. 5 | """ 6 | 7 | 8 | def str_or_seq_as_list(value): 9 | if isinstance(value, str): 10 | return [value] 11 | elif is_iterable(value): 12 | return list(value) 13 | else: 14 | raise TypeError(f"Expected a string or sequence; got {value!r}") 15 | 16 | 17 | def str_or_seq_or_none_as_list(value): 18 | if isinstance(value, str): 19 | return [value] 20 | elif is_iterable(value): 21 | return list(value) 22 | elif value is None: 23 | return [] 24 | else: 25 | raise TypeError(f"Expected a string or sequence or None; got {value!r}") 26 | 27 | 28 | def is_iterable(x): 29 | try: 30 | iter(x) 31 | return True 32 | except TypeError: 33 | return False 34 | -------------------------------------------------------------------------------- /example/hello_world.py: -------------------------------------------------------------------------------- 1 | import bionic as bn 2 | 3 | # Initialize the builder object we'll use to construct our flow. 4 | builder = bn.FlowBuilder("hello_world") 5 | 6 | # Define new entities "greeting" and "subject" with fixed values. 7 | builder.assign("greeting", "Hello") 8 | builder.assign("subject", "world") 9 | 10 | 11 | # Define a "message" entity, constructed by taking the values of "greeting" and 12 | # "subject" and combining them in a sentence. 13 | # The `@builder` decorator tells Bionic to define a new derived entity; Bionic 14 | # infers the name of the new entity ("message") and the names of its 15 | # dependencies ("greeting" and "subject"). 16 | @builder 17 | def message(greeting, subject): 18 | return f"{greeting} {subject}!" 19 | 20 | 21 | # Assemble the flow object, which is capable of computing any of the entities 22 | # we've defined. 23 | flow = builder.build() 24 | 25 | if __name__ == "__main__": 26 | # Use our flow to compute the message "Hello world!" 27 | print(flow.get("message")) 28 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=88 3 | exclude = docs,.venv 4 | ignore = 5 | # These rules are not compatible with black (our code formatter). 6 | E203 # "whitespace before ':'" 7 | W503 # "line break occurred before a binary operator" 8 | # Black handles line lengths for us (slightly less strictly than flake8). 9 | E501 # "line too long" 10 | # We allow TODO and XXX comments in code. 
11 | # (But we don't allow FIX-ME -- hyphen inserted so this string doesn't show up in 12 | # searches.) 13 | T101 # "fixme found (TODO)" 14 | T102 # "fixme found (XXX)" 15 | per-file-ignores = 16 | # Allow print statements in example code. 17 | build/lib/example/*:T201 18 | example/*:T201 19 | 20 | # NOTE On my MacBook this plugin adds about 1 extra second to Flake8's runtime, making 21 | # it about 5s total. That's not trivial, so it might not be worth it to have this 22 | # enabled all the time. 23 | [flake8:local-plugins] 24 | extension = 25 | DNM1 = bionic.flake8.check_dnode_match:Checker 26 | 27 | [tool:pytest] 28 | filterwarnings=ignore::DeprecationWarning 29 | -------------------------------------------------------------------------------- /tests/test_flow/test_plotting.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import bionic as bn 4 | 5 | 6 | def test_pyplot_no_parens(builder): 7 | @builder 8 | @bn.pyplot 9 | def plot(pyplot): 10 | ax = pyplot.subplot() 11 | ax.plot([1, 2, 3], [1, 3, 9]) 12 | 13 | img = builder.build().get("plot") 14 | assert img.width > 0 15 | assert img.height > 0 16 | 17 | 18 | def test_pyplot_no_args(builder): 19 | @builder 20 | @bn.pyplot() 21 | def plot(pyplot): 22 | ax = pyplot.subplot() 23 | ax.plot([1, 2, 3], [1, 3, 9]) 24 | 25 | img = builder.build().get("plot") 26 | assert img.width > 0 27 | assert img.height > 0 28 | 29 | 30 | def test_pyplot_name_arg(builder): 31 | @builder 32 | @bn.pyplot("plt") 33 | def plot(plt): 34 | ax = plt.subplot() 35 | ax.plot([1, 2, 3], [1, 3, 9]) 36 | 37 | img = builder.build().get("plot") 38 | assert img.width > 0 39 | assert img.height > 0 40 | 41 | 42 | def test_pyplot_missing_dep(builder): 43 | with pytest.raises(ValueError): 44 | 45 | @builder 46 | @bn.pyplot 47 | def plot(some_arg): 48 | pass 49 | -------------------------------------------------------------------------------- /bionic/exception.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bionic-specific exception classes. 
3 | """ 4 | 5 | 6 | class UndefinedEntityError(KeyError): 7 | @classmethod 8 | def for_name(cls, name): 9 | return cls(f"Entity {name!r} is not defined") 10 | 11 | 12 | class AlreadyDefinedEntityError(ValueError): 13 | @classmethod 14 | def for_name(cls, name): 15 | return cls(f"Entity {name!r} is already defined") 16 | 17 | 18 | class UnsetEntityError(ValueError): 19 | pass 20 | 21 | 22 | class IncompatibleEntityError(ValueError): 23 | pass 24 | 25 | 26 | class UnsupportedSerializedValueError(Exception): 27 | pass 28 | 29 | 30 | class CodeVersioningError(Exception): 31 | def __init__(self, message, bad_descriptor): 32 | super(CodeVersioningError, self).__init__(message) 33 | self.bad_descriptor = bad_descriptor 34 | 35 | 36 | class EntitySerializationError(Exception): 37 | pass 38 | 39 | 40 | class EntityComputationError(Exception): 41 | pass 42 | 43 | 44 | class EntityValueError(ValueError): 45 | pass 46 | 47 | 48 | class AttributeValidationError(Exception): 49 | pass 50 | 51 | 52 | class MalformedDescriptorError(Exception): 53 | pass 54 | -------------------------------------------------------------------------------- /tests/test_tokenize.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from bionic.tokenization import tokenize 4 | 5 | 6 | def test_tokenize_straight_translation(): 7 | assert tokenize(1) == "1" 8 | assert tokenize(1.0) == "1.0" 9 | assert tokenize("hello") == "hello" 10 | 11 | 12 | def test_tokenize_simple_cleaning(): 13 | assert tokenize("Hello").startswith("hello_") 14 | assert tokenize(True).startswith("true_") 15 | assert tokenize("test\x00").startswith("test._") 16 | 17 | 18 | def test_avoid_initial_period(): 19 | assert tokenize(".test").startswith("_.test") 20 | assert tokenize("\x00\x00").startswith("_..") 21 | 22 | 23 | def test_ensure_token_length_is_capped(): 24 | assert len(tokenize("a" * 1000)) < 50 25 | 26 | 27 | class Point: 28 | def __init__(self, x, y): 29 | self.x = x 30 | self.y = y 31 | 32 | 33 | def test_tokenize_complex_type(): 34 | token = tokenize(Point(1, 2), pickle.dumps) 35 | assert isinstance(token, str) 36 | assert len(token) == 10 37 | 38 | 39 | def test_tokenize_no_collisions(): 40 | points = [Point(x, y) for x in range(100) for y in range(100)] 41 | tokens = [tokenize(point, pickle.dumps) for point in points] 42 | assert len(set(tokens)) == len(points) 43 | -------------------------------------------------------------------------------- /tests/test_utils/test_urls.py: -------------------------------------------------------------------------------- 1 | from bionic.persistence import relativize_url, derelativize_url 2 | 3 | 4 | rel_artifact_url = "../artifacts/artifact.pkl" 5 | abs_artifact_url = "file:///Users/User/cache/artifacts/artifact.pkl" 6 | abs_metadata_url = "file:///Users/User/cache/metadata/metadata.yaml" 7 | gcs_artifact_url = "gs://my_bucket/cache/artifacts/artifact.pkl" 8 | gcs_metadata_url = "gs://my_bucket/cache/metadata/metadata.yaml" 9 | 10 | 11 | # file url tests 12 | def test_relativize_abs_file_urls(): 13 | assert relativize_url(abs_artifact_url, abs_metadata_url) == rel_artifact_url 14 | 15 | 16 | def test_relativize_relative_file_urls(): 17 | assert relativize_url(rel_artifact_url, abs_metadata_url) == rel_artifact_url 18 | 19 | 20 | def test_derelativize_abs_file_urls(): 21 | assert derelativize_url(abs_artifact_url, abs_metadata_url) == abs_artifact_url 22 | 23 | 24 | def test_derelativize_relative_file_urls(): 25 | assert 
derelativize_url(rel_artifact_url, abs_metadata_url) == abs_artifact_url 26 | 27 | 28 | # gcs url tests 29 | def test_relativize_gcs_urls(): 30 | assert relativize_url(gcs_artifact_url, gcs_metadata_url) == gcs_artifact_url 31 | 32 | 33 | def test_derelativize_gcs_urls(): 34 | assert derelativize_url(gcs_artifact_url, gcs_metadata_url) == gcs_artifact_url 35 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://gitlab.com/pycqa/flake8 3 | rev: 3.8.3 4 | hooks: 5 | - id: flake8 6 | # Pre-commit does some static analysis by caching packages (can 7 | # be found in ~/.cache/pre-commit/). When used in a virtualenv 8 | # (like pyenv), flake8 does not work correctly with the default 9 | # language and the custom dnode match linter breaks. 10 | # To get around this, pre-commit devs recommend using it as a 11 | # "system" hook (default is "local"). See 12 | # https://github.com/pre-commit/pre-commit-hooks/issues/157 13 | # for more information on this issue. 14 | language: system 15 | 16 | - repo: https://github.com/pre-commit/pre-commit-hooks 17 | rev: v3.2.0 18 | hooks: 19 | - id: trailing-whitespace 20 | - id: end-of-file-fixer 21 | # TODO: This throws an error on MacOS Big Sur. Updating the rev 22 | # to v3.3.0 does not fix the problem either. Here is the error for 23 | # reference: 24 | # 25 | # could not determine a constructor for the tag 'tag:yaml.org,2002:python/tuple' 26 | # 27 | # - id: check-yaml 28 | - id: check-added-large-files 29 | 30 | - repo: https://github.com/psf/black 31 | rev: 20.8b1 32 | hooks: 33 | - id: black 34 | language_version: python3 35 | -------------------------------------------------------------------------------- /bionic/filecopier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the ``FileCopier`` class, which is essentially a file path with a useful 3 | ``copy`` method attached to it. 4 | """ 5 | 6 | import subprocess 7 | 8 | from bionic.gcs import upload_to_gcs 9 | 10 | 11 | class FileCopier: 12 | """ 13 | A wrapper for a Path object, exposing a ``copy`` method that will copy 14 | the underlying file to a local or cloud destination. 15 | 16 | Parameters 17 | ---------- 18 | src_file_path: Path 19 | A path to a file. 20 | """ 21 | 22 | def __init__(self, src_file_path): 23 | self.src_file_path = src_file_path 24 | 25 | def copy(self, destination): 26 | """ 27 | Copies the file that this FileCopier represents to `destination`. 28 | 29 | This supports both local and GCS destinations. For the former, we follow cp's 30 | conventions and for the latter we follow fsspec's put / put_file APIs which 31 | can be found at 32 | https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.
33 | 34 | Parameters 35 | ---------- 36 | 37 | destination: Path or str 38 | Where to copy the underlying file 39 | """ 40 | 41 | # handle gcs 42 | if str(destination).startswith("gs://"): 43 | upload_to_gcs(self.src_file_path, str(destination)) 44 | else: 45 | subprocess.check_call( 46 | ["cp", "-R", str(self.src_file_path), str(destination)] 47 | ) 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | .pytest_cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Pytest-profiling reports 50 | prof/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | # pyenv python configuration file 66 | .python-version 67 | 68 | # Datafiles 69 | *.csv 70 | *.gz 71 | *.h5 72 | *.pkl 73 | *.pk 74 | *.html 75 | *.log 76 | *.db 77 | *.db-journal 78 | 79 | # iPython Notebooks 80 | *.ipynb 81 | .ipynb_checkpoints 82 | 83 | # Vim swap files 84 | *.swp 85 | 86 | # Bionic cache files 87 | bndata 88 | 89 | # VSCode settings 90 | .vscode 91 | 92 | # Python virtual environment 93 | .venv 94 | 95 | # Test data 96 | !tests/test_flow/test_persistence_compatibility/artifacts/**/*.pkl 97 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import shutil 3 | 4 | from .generate_test_compatibility_cache import Harness, CACHE_TEST_DIR 5 | 6 | 7 | @pytest.fixture 8 | def older_serialized_cache_harness(make_counter, tmp_path): 9 | # shutil.copytree dest should not exist 10 | tmp_cache_path = tmp_path.joinpath("test_cache") 11 | shutil.copytree(CACHE_TEST_DIR, tmp_cache_path) 12 | harness = Harness(tmp_cache_path, make_counter) 13 | return harness 14 | 15 | 16 | # Tests caching backward compatibility by loading and deserializing 17 | # an old snapshot of the cache. Test failure indicates that the changes 18 | # made to the caching layer are backward incompatible. 19 | # In case of a failure, either 20 | # a) fix the caching logic so it's backward compatible or 21 | # b) update the cache schema version and generate a new cache snapshot. 22 | # 23 | # To update cache schema version, change `CACHE_SCHEMA_VERSION` in cache.py.
24 | 25 | # To regenerate the cache, run the following command from the bionic/ dir 26 | # `python -m tests.test_flow.generate_test_compatibility_cache` 27 | def test_caching_compatibility(older_serialized_cache_harness): 28 | flows = older_serialized_cache_harness.flows 29 | 30 | for flow in flows: 31 | assert ( 32 | flow.get("total_sum") == older_serialized_cache_harness.EXPECTED_TOTAL_SUM 33 | ) 34 | 35 | # Assert that no methods were called. 36 | assert older_serialized_cache_harness.lowercase_sum_counter.times_called() == 0 37 | assert older_serialized_cache_harness.uppercase_sum_counter.times_called() == 0 38 | assert older_serialized_cache_harness.total_sum_counter.times_called() == 0 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bionic 2 | 3 | Bionic is a framework for analyzing and modeling data in Python. It's designed 4 | to help you **iterate faster on your research**, and help your colleagues 5 | **reuse your code more easily**. 6 | 7 | Bionic is in alpha and evolving rapidly. We recommend it for research projects 8 | where the dataset fits in memory. We do not recommend it for pipelines running 9 | in production. 10 | 11 | Check out the [full documentation](https://bionic.readthedocs.io/en/stable/), 12 | or go straight to [Get 13 | Started](https://bionic.readthedocs.io/en/stable/get-started.html). 14 | 15 | ## Installation 16 | 17 | Bionic can be installed from PyPI: 18 | 19 | pip install bionic[standard] 20 | 21 | You'll probably want to install [Graphviz](https://www.graphviz.org/) as well. 22 | See the [Installation 23 | docs](https://bionic.readthedocs.io/en/stable/get-started.html#installation) 24 | for more details on installing and configuring Bionic's dependencies. 25 | 26 | ## Contributing 27 | 28 | See the 29 | [Contribution](https://bionic.readthedocs.io/en/stable/contributing.html) 30 | section of our docs. 31 | 32 | ## License 33 | 34 | Copyright 2019 Square, Inc. 35 | 36 | Licensed under the Apache License, Version 2.0 (the "License"); 37 | you may not use this file except in compliance with the License. 38 | You may obtain a copy of the License at 39 | 40 | http://www.apache.org/licenses/LICENSE-2.0 41 | 42 | Unless required by applicable law or agreed to in writing, software 43 | distributed under the License is distributed on an "AS IS" BASIS, 44 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 45 | See the License for the specific language governing permissions and 46 | limitations under the License.
47 | -------------------------------------------------------------------------------- /example/intro_workflow.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.stats import multivariate_normal 4 | from sklearn.linear_model import LinearRegression 5 | 6 | import bionic as bn 7 | 8 | builder = bn.FlowBuilder("intro") 9 | 10 | builder.assign("random_seed", 0) 11 | builder.assign("variance", 2) 12 | builder.assign("correlation", 0.5) 13 | builder.assign("n_samples", 1000) 14 | 15 | 16 | @builder 17 | def my_random_df(random_seed, variance, correlation, n_samples): 18 | data = multivariate_normal( 19 | mean=[0, 0], 20 | cov=[[variance, correlation * variance], [correlation * variance, variance]], 21 | ).rvs(size=n_samples, random_state=random_seed) 22 | return pd.DataFrame(columns=["x", "y"], data=data) 23 | 24 | 25 | @builder 26 | def my_model(my_random_df): 27 | model = LinearRegression() 28 | model.fit(my_random_df[["x"]], my_random_df["y"]) 29 | return model 30 | 31 | 32 | @builder 33 | def est_correlation(my_model): 34 | return my_model.coef_[0] 35 | 36 | 37 | @builder 38 | def est_intercept(my_model): 39 | return my_model.intercept_ 40 | 41 | 42 | @builder 43 | @bn.pyplot("plt") 44 | def my_plot(my_random_df, est_correlation, est_intercept, plt): 45 | with plt.style.context("seaborn-whitegrid"): 46 | plt.scatter(my_random_df["x"], my_random_df["y"], alpha=0.2) 47 | 48 | line_xs = np.array([my_random_df["x"].min(), my_random_df["x"].max()]) 49 | line_ys = (line_xs * est_correlation) + est_intercept 50 | plt.plot(line_xs, line_ys) 51 | 52 | 53 | flow = builder.build() 54 | 55 | if __name__ == "__main__": 56 | bn.util.init_basic_logging() 57 | 58 | print("Estimated intercept:", flow.get("est_intercept")) 59 | print("Estimated correlation:", flow.get("est_correlation")) 60 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | from runpy import run_path 6 | 7 | from setuptools import find_packages, setup 8 | 9 | # This appears to be the least annoying Python-version-agnostic way of loading 10 | # an external file. 11 | extras_require = run_path( 12 | os.path.join(os.path.dirname(__file__), "bionic", "deps/extras.py") 13 | )["extras_require"] 14 | 15 | with open("README.md") as readme_file: 16 | readme = readme_file.read() 17 | 18 | requirements = [ 19 | "attrs>=20.1", 20 | "cattrs", 21 | "PyYAML", 22 | "numpy", 23 | "pandas", 24 | "pyarrow", 25 | # 0.19.1 had a regression which was fixed in 0.19.2 26 | # See tobgu/pyrsistent#263 on GitHub.
27 | "pyrsistent!=0.19.1", 28 | "decorator<5", 29 | ] 30 | 31 | setup( 32 | name="bionic", 33 | version="0.11.1", 34 | description=( 35 | "A Python framework for building, running, and sharing data science " 36 | "workflows" 37 | ), 38 | long_description=readme, 39 | long_description_content_type="text/markdown", 40 | license="Apache License 2.0", 41 | author="Janek Klawe", 42 | author_email="janek@squareup.com", 43 | url="https://github.com/square/bionic", 44 | packages=find_packages(), 45 | include_package_data=True, 46 | install_requires=requirements, 47 | extras_require=extras_require, 48 | python_requires=">=3.7", 49 | zip_safe=False, 50 | keywords="bionic", 51 | classifiers=[ 52 | "Development Status :: 3 - Alpha", 53 | "Intended Audience :: Developers", 54 | "Natural Language :: English", 55 | "License :: OSI Approved :: Apache Software License", 56 | "Programming Language :: Python :: 3", 57 | "Programming Language :: Python :: 3.7", 58 | ], 59 | ) 60 | -------------------------------------------------------------------------------- /docs/api/flow.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Flow and FlowBuilder 3 | ==================== 4 | 5 | Introduction 6 | ------------ 7 | 8 | ``FlowBuilder`` and ``Flow`` are the primary interfaces for constructing and 9 | running Bionic flows. Either of them can be used to represent 10 | the collection of interdependent entities that make up a single analysis. The 11 | difference is that a ``FlowBuilder`` is a mutable object which can be updated, 12 | while a ``Flow`` is an immutable object which can perform computation. 13 | 14 | The typical pattern is to start with an empty ``FlowBuilder``, incrementally 15 | add entity definitions to it, then use ``FlowBuilder.build()`` to generate a 16 | ``Flow``. This ``Flow`` can be used immediately to compute entity values, or 17 | passed to other code, which might reconfigure or extend it. 18 | 19 | Although ``Flow`` objects are immutable, there is a mechanism for modifying 20 | them: instead of a method like ``set`` that mutates the ``Flow``, there is a 21 | ``setting`` method that returns a new copy with the requested change. This 22 | allows ``Flow``\ s to be easily customized without worrying about shared state. 23 | However, this API can only be used to update existing entities; if you want to 24 | define new entities, you'll need to convert the ``Flow`` back to a 25 | ``FlowBuilder`` using ``to_builder``. 26 | 27 | See `the Concepts documentation 28 | <../concepts.rst#flows-flowbuilders-and-entities>`_ for more details. 29 | 30 | FlowBuilder API 31 | --------------- 32 | 33 | .. autoclass:: bionic.FlowBuilder 34 | :members: 35 | 36 | FlowCase API 37 | ............ 38 | 39 | .. autoclass:: bionic.flow.FlowCase 40 | :members: 41 | 42 | Flow API 43 | -------- 44 | 45 | .. autoclass:: bionic.Flow 46 | :members: 47 | 48 | Cache API 49 | --------- 50 | 51 | .. autoclass:: bionic.cache_api.Cache 52 | :members: 53 | 54 | CacheEntry API 55 | -------------- 56 | 57 | .. 
autoclass:: bionic.cache_api.CacheEntry 58 | :members: -------------------------------------------------------------------------------- /tests/test_flow/test_join.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bionic.exception import UnsetEntityError 4 | 5 | 6 | @pytest.fixture(scope="function") 7 | def preset_builder(builder): 8 | builder.declare("x") 9 | builder.declare("y") 10 | builder.declare("z") 11 | 12 | @builder 13 | def xy(x, y): 14 | return x * y 15 | 16 | @builder 17 | def yz(y, z): 18 | return y * z 19 | 20 | @builder 21 | def xy_plus_yz(xy, yz): 22 | return xy + yz 23 | 24 | return builder 25 | 26 | 27 | def test_simple(preset_builder): 28 | builder = preset_builder 29 | 30 | builder.set("x", 2) 31 | builder.set("y", 3) 32 | builder.set("z", 4) 33 | 34 | flow = builder.build() 35 | 36 | assert flow.get("xy") == 6 37 | assert flow.get("yz") == 12 38 | assert flow.get("xy_plus_yz") == 18 39 | 40 | 41 | def test_cartesian_product(preset_builder): 42 | builder = preset_builder 43 | 44 | builder.set("x", values=[2]) 45 | builder.set("y", values=[3, 4]) 46 | builder.set("z", values=[5, 6, 7]) 47 | 48 | flow = builder.build() 49 | 50 | assert flow.get("xy", set) == {2 * 3, 2 * 4} # noqa: E226 51 | assert flow.get("yz", set) == { 52 | 3 * 5, 53 | 3 * 6, 54 | 3 * 7, 55 | 4 * 5, 56 | 4 * 6, 57 | 4 * 7, 58 | } # noqa: E226 59 | assert flow.get("xy_plus_yz", set) == { 60 | 2 * 3 + 3 * 5, 61 | 2 * 3 + 3 * 6, 62 | 2 * 3 + 3 * 7, 63 | 2 * 4 + 4 * 5, 64 | 2 * 4 + 4 * 6, 65 | 2 * 4 + 4 * 7, 66 | } # noqa: E226 67 | 68 | 69 | def test_empty(preset_builder): 70 | builder = preset_builder 71 | 72 | builder.set("y", 3) 73 | builder.set("z", values=[4, 5]) 74 | 75 | flow = builder.build() 76 | 77 | assert flow.get("xy", set) == set() 78 | assert flow.get("yz", set) == {12, 15} 79 | assert flow.get("xy_plus_yz", set) == set() 80 | 81 | with pytest.raises(UnsetEntityError): 82 | flow.get("xy") 83 | with pytest.raises(UnsetEntityError): 84 | flow.get("xy_plus_yz") 85 | -------------------------------------------------------------------------------- /bionic/utils/reload.py: -------------------------------------------------------------------------------- 1 | import builtins 2 | import importlib 3 | from sys import modules as module_registry 4 | from fnmatch import fnmatch 5 | from sysconfig import get_paths as sysconfig_paths 6 | 7 | 8 | def recursive_reload(module): 9 | """ 10 | Helper method to reload a module recursively. If a module imports a set of 11 | modules, then the modules in the set are also reloaded and so on. 12 | 13 | Modules that are part of the current python installation are not reloaded. 14 | For example, modules part of python standard library or modules installed 15 | through pip (or some other package manager that use distutils). 16 | 17 | Also note that this method may not be able to handle dynamic imports that 18 | only happens at runtime. For example, if a module imports another module 19 | only when a certain method is executed, reloading the former module does 20 | not guarantee that the latter module is reloaded. 
21 | """ 22 | 23 | original_import = builtins.__import__ 24 | already_reloaded = set() 25 | 26 | def custom_import(name, globals=None, locals=None, fromlist=[], level=0): 27 | if name in module_registry: 28 | module = module_registry[name] 29 | if name not in already_reloaded and not is_internal_module(module): 30 | already_reloaded.add(name) 31 | importlib.reload(module) 32 | return original_import(name, globals, locals, fromlist, level) 33 | 34 | try: 35 | builtins.__import__ = custom_import 36 | return importlib.reload(module) 37 | finally: 38 | builtins.__import__ = original_import 39 | 40 | 41 | def is_internal_module(module): 42 | return not hasattr(module, "__file__") or is_internal_file(module.__file__) 43 | 44 | 45 | def is_internal_file(filename): 46 | """ 47 | Helper method that determines whether the provided file is internal 48 | to Python, i.e., it's in the Python installation paths. 49 | """ 50 | return any( 51 | fnmatch(filename, file_dir + "/*") for file_dir in sysconfig_paths().values() 52 | ) 53 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/lowercase_chars/c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19/d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4/metadata_0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 3 | url: ../../../../artifacts/lowercase_chars/cfe1e872-5b26-4733-9859-4d323d667ae5/lowercase_chars.setpkl 4 | descriptor: lowercase_chars 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: 0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f 20 | functional_hash: c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19 21 | nominal_hash: d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - lowercase_chars 26 | - 9ed0cd8e69 27 | code_fingerprint: 28 | bytecode_hash: null 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: 9ed0cd8e69 34 | minor: '0' 35 | dep_digests: [] 36 | descriptor: 37 | exact_hash: 0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f 38 | functional_hash: c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19 39 | nominal_hash: d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4 40 | descriptor: lowercase_chars 41 | exact_hash: 0c0598ce0c72797d2da87fde651ed6df34f0a7477bcebac7fb3cf0699c3c3f0f 42 | functional_hash: c41a252b102715306f81212ca8465bed426ce061c4123344374beef09d8f3c19 43 | nominal_hash: d7da6d04d6196967cff9964eda07ad47ec9006da10f9a88b9882e5697fdb47a4 44 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/uppercase_chars/9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45/12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975/metadata_2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0.yaml: 
-------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 3 | url: ../../../../artifacts/uppercase_chars/185898d4-eaeb-46dc-85db-498018b29756/uppercase_chars.setpkl 4 | descriptor: uppercase_chars 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - uppercase_chars 9 | - e99019711a 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: 2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0 20 | functional_hash: 9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45 21 | nominal_hash: 12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - uppercase_chars 26 | - e99019711a 27 | code_fingerprint: 28 | bytecode_hash: null 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: e99019711a 34 | minor: '0' 35 | dep_digests: [] 36 | descriptor: 37 | exact_hash: 2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0 38 | functional_hash: 9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45 39 | nominal_hash: 12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975 40 | descriptor: uppercase_chars 41 | exact_hash: 2f13ca96c050b75e0a719b2dab30735d7a894dbf636cf756f109d671d10087c0 42 | functional_hash: 9a35aac5b21b31f32254590e01830edd7bd3df6b03c93b7186f733b5e6aeaa45 43 | nominal_hash: 12aa58ae3a54347d38eef097626b97fd71ee9d5054f1c65b4a5ec40de608b975 44 | -------------------------------------------------------------------------------- /docs/maintaining.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | ================== 4 | Maintaining Bionic 5 | ================== 6 | 7 | This page documents project maintenance processes followed by Bionic’s core developers. 8 | If you’re not a core developer but need a new release for some reason, please contact 9 | one of the developers `listed here 10 | `_. 11 | 12 | Release Process 13 | --------------- 14 | 15 | We use `bumpversion `_ to manage our version 16 | strings and `GitHub Releases `_ to publish 17 | releases to PyPI. Follow these steps to release a new version of Bionic: 18 | 19 | 1. Merge a PR with updates to our version strings and release notes. 20 | 21 | a. Check out the current master branch. 22 | b. Create a new branch. 23 | c. Run ``bumpversion minor`` or ``bumpversion patch`` to bump the version. Running 24 | this command will create a new commit with all the version strings updated to the 25 | new version. 26 | d. Follow the commented instructions near ``Upcoming Version`` in the 27 | `release-notes.rst 28 | `_ file and 29 | update the upcoming version section. 30 | e. Amend the commit to add the release notes changes. 31 | f. Open a PR for your branch and merge it after approval from another core 32 | developer. 33 | 34 | 2. Once your PR is merged, create a release from the GitHub Releases page. 35 | 36 | a. On the `GitHub Releases `_ page, click 37 | ``Draft a new release``. 38 | b. Specify the bumped version as the ``Tag version`` and ``Release title``. Don't 39 | forget to prefix the version with ``v``. 
E.g., if the new version is ``0.8.0``, 40 | your tag and title should both be ``v0.8.0``. 41 | c. Click ``Publish Release``. 42 | d. Verify that the `Upload Python Package Action 43 | `_ 44 | workflow was completed successfully and the new release is visible on `PyPI 45 | `_. 46 | -------------------------------------------------------------------------------- /tests/test_flow/test_interactions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ..helpers import RoundingProtocol 4 | import bionic as bn 5 | 6 | 7 | @pytest.fixture(scope="function") 8 | def preset_builder(builder): 9 | builder.assign("n", values=[1, 2, 3]) 10 | 11 | @builder 12 | def xs(n): 13 | return list(range(n)) 14 | 15 | @builder 16 | def ys(xs): 17 | return [x**2 for x in xs] 18 | 19 | return builder 20 | 21 | 22 | def test_pyplot_then_gather(preset_builder): 23 | builder = preset_builder 24 | 25 | @builder 26 | @bn.pyplot("plt") 27 | @bn.gather("n", ["xs", "ys"]) 28 | def plot(gather_df, plt): 29 | for row in gather_df.itertuples(): 30 | plt.plot(row.xs, row.ys) 31 | 32 | img = builder.build().get("plot") 33 | assert img.width > 0 34 | assert img.height > 0 35 | 36 | 37 | def test_gather_then_pyplot(preset_builder): 38 | builder = preset_builder 39 | 40 | @builder 41 | @bn.gather("n", ["xs", "ys"]) 42 | @bn.pyplot("plt") 43 | def plot(gather_df, plt): 44 | for row in gather_df.itertuples(): 45 | plt.plot(row.xs, row.ys) 46 | 47 | img = builder.build().get("plot") 48 | assert img.width > 0 49 | assert img.height > 0 50 | 51 | 52 | def test_outputs_with_multiplicity(builder): 53 | builder.assign("x", values=[2, 3]) 54 | builder.assign("y", 4) 55 | 56 | @builder 57 | @bn.outputs("x_plus_y", "xy") 58 | def _(x, y): 59 | return (x + y), (x * y) 60 | 61 | @builder 62 | @bn.gather("xy") 63 | def sum_xy(gather_df): 64 | return gather_df["xy"].sum() 65 | 66 | @builder 67 | @bn.gather("x_plus_y") 68 | def sum_x_plus_y(gather_df): 69 | return gather_df["x_plus_y"].sum() 70 | 71 | flow = builder.build() 72 | assert flow.get("sum_xy") == 20 73 | assert flow.get("sum_x_plus_y") == 13 74 | 75 | flow = flow.clearing_cases("x") 76 | assert flow.get("sum_xy") == 0 77 | assert flow.get("sum_x_plus_y") == 0 78 | 79 | 80 | def test_outputs_with_protocols(builder): 81 | @builder 82 | @RoundingProtocol() 83 | @bn.outputs("x", "y") 84 | def _(): 85 | return 0.1, 1.9 86 | 87 | flow = builder.build() 88 | 89 | assert flow.get("x") == 0 90 | assert flow.get("y") == 2 91 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_longest_regex_prefix(): 5 | from .helpers import longest_regex_prefix_match 6 | 7 | def longest_prefix(regex, string): 8 | return longest_regex_prefix_match(regex, string).re.pattern 9 | 10 | assert longest_prefix("test", "test") == "test" 11 | assert longest_prefix("test", "te") == "te" 12 | assert longest_prefix("test", "text") == "te" 13 | assert longest_prefix("test", "testtest") == "test" 14 | assert longest_prefix("zest", "test") == "" 15 | assert longest_prefix("(test)", "test") == "(test)" 16 | assert longest_prefix("(test)", "text") == "" 17 | assert longest_prefix("(test)test", "testtest") == "(test)test" 18 | assert longest_prefix("(test)test", "testtext") == "(test)te" 19 | assert longest_prefix("x\n\n\nx", "x\n\n\nx") == "x\n\n\nx" 20 | assert longest_prefix("x\n\n\nx", 
"x\n\n\ny") == "x\n\n\n" 21 | assert longest_prefix("x\n\n\nx", "x\n\ny") == "x\n\n" 22 | assert longest_prefix("x\n\n\nx", "y\n\n\nx") == "" 23 | assert longest_prefix("test.*test", "testtest") == "test.*test" 24 | assert longest_prefix("test.*test", "testxxtest") == "test.*test" 25 | assert longest_prefix("test.*test", "testxxzest") == "test.*t" 26 | assert longest_prefix("test.*test", "testxxz") == "test.*" 27 | assert longest_prefix("test.*test", "texttest") == "te" 28 | 29 | 30 | def test_assert_re_matches(): 31 | from .helpers import assert_re_matches 32 | 33 | def assert_re_nomatch(regex, string): 34 | with pytest.raises(AssertionError): 35 | assert_re_matches(regex, string) 36 | 37 | assert_re_matches("test", "test") 38 | assert_re_matches("test", "testxxx") 39 | assert_re_nomatch("test", "tesd") 40 | 41 | assert_re_matches("test$", "test") 42 | assert_re_nomatch("test$", "testx") 43 | 44 | assert_re_matches(".*test", "test") 45 | assert_re_matches(".*test", "xxtest") 46 | assert_re_matches(".*test", "testxx") 47 | assert_re_nomatch(".*test", "tesd") 48 | 49 | assert_re_matches("(test)", "test") 50 | assert_re_matches("(test)", "testx") 51 | assert_re_nomatch("(test)", "tesd") 52 | 53 | assert_re_matches("test.*test", "testtest") 54 | assert_re_matches("test.*test", "testxxtest") 55 | assert_re_nomatch("test.*test", "test\ntest") 56 | 57 | assert_re_matches("(?s)test.*test", "test\ntest") 58 | -------------------------------------------------------------------------------- /bionic/deps/extras.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file defines the ``extras_require`` argument used in setup.py -- i.e., the 3 | set of available Bionic subpackages (like bionic[standard] or bionic[gcp]). 4 | It's in its own file because Bionic uses the information here when importing 5 | optional dependencies. 6 | """ 7 | 8 | from collections import OrderedDict 9 | 10 | 11 | def combine(*dep_lists): 12 | """Combines multiple lists into a single sorted list of distinct items.""" 13 | return list(sorted(set(dep for dep_list in dep_lists for dep in dep_list))) 14 | 15 | 16 | # Construct the mapping from "extra name" to package descriptor. 17 | # We use an OrderedDict because the optdep module will want to know which 18 | # extras were added first. 
19 | extras = OrderedDict() 20 | 21 | extras["image"] = ["Pillow"] 22 | # We don't support versions of matplotlib below 3.1 because the default backend has 23 | # problems on OS X; and we don't support 3.2.x because of this bug: 24 | # https://github.com/matplotlib/matplotlib/issues/15410 25 | extras["matplotlib"] = combine(["matplotlib>=3.1,!=3.2.*"], extras["image"]) 26 | extras["viz"] = combine(["hsluv", "networkx", "pydot"], extras["image"]) 27 | 28 | extras["standard"] = combine(extras["matplotlib"], extras["viz"]) 29 | 30 | extras["dill"] = ["dill"] 31 | extras["dask"] = ["dask[dataframe]"] 32 | extras["gcp"] = ["fsspec", "gcsfs"] 33 | extras["parallel"] = ["cloudpickle", "loky"] 34 | extras["geopandas"] = ["geopandas"] 35 | extras["aip"] = combine( 36 | [ 37 | "google-auth", 38 | "google-api-python-client", 39 | "google-cloud-logging", 40 | "cloudpickle", 41 | "docker", 42 | ], 43 | extras["gcp"], 44 | ) 45 | 46 | extras["examples"] = combine(extras["standard"], ["scikit-learn"]) 47 | extras["full"] = combine(*extras.values()) 48 | 49 | extras["dev"] = combine( 50 | [ 51 | "pytest", 52 | "pytest-shard", 53 | "black", 54 | "flake8", 55 | "flake8-print", 56 | "flake8-fixme", 57 | "importlib-metadata<5", # flake8 is incompatible with importlib 5.0.0 58 | "sphinx!=3.2.0", 59 | "sphinx_rtd_theme", 60 | "sphinx-autobuild", 61 | "nbsphinx", 62 | "jupyter", 63 | "bumpversion", 64 | "GitPython", 65 | ], 66 | *extras.values() 67 | ) 68 | 69 | # This will be imported by setup.py. 70 | extras_require = extras 71 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/lowercase_sum/18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365/abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7/metadata_d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 53f0c7cbf9464530929bbdb5991d003f15e53687898ab690af83ee8ea90c1533 3 | url: ../../../../artifacts/lowercase_sum/44556b2a-bd8f-44c6-a1bb-1a03ed2a839f/lowercase_sum.json 4 | descriptor: lowercase_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb 20 | functional_hash: 18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365 21 | nominal_hash: abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - lowercase_chars 26 | - 9ed0cd8e69 27 | code_fingerprint: 28 | bytecode_hash: b'\xd8!\\\xa0wX\x8d\x1c\x06\xb8AK\xaf\xde\xa8\xc2' 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: '0' 34 | minor: '0' 35 | dep_digests: 36 | - exact_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 37 | functional_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 38 | nominal_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 39 | provenance: null 40 | descriptor: 41 | exact_hash: d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb 42 | functional_hash: 
18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365 43 | nominal_hash: abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7 44 | descriptor: lowercase_sum 45 | exact_hash: d6cdbcac0ffae0019872657e0074ec86ba77748544bf477ebeb02c7fe1491beb 46 | functional_hash: 18f9fabca61690edac92e2e690a0238243a6765a5a323023ea921df8d167b365 47 | nominal_hash: abdeaf50842c524bad26317b37054a082ee1c42365af2cccc3ef44963c4e5ab7 48 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/lowercase_sum/df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae/928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f/metadata_68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: a3e8bae3649bc57d44b928616c9a641643e36bd26e13467f24dffe7aba3eaff2 3 | url: ../../../../artifacts/lowercase_sum/1c22c085-6fa5-4df7-a69a-a03a3e880e90/lowercase_sum.json 4 | descriptor: lowercase_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: 68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633 20 | functional_hash: df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae 21 | nominal_hash: 928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - lowercase_chars 26 | - 9ed0cd8e69 27 | code_fingerprint: 28 | bytecode_hash: b'\xd8!\\\xa0wX\x8d\x1c\x06\xb8AK\xaf\xde\xa8\xc2' 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: '0' 34 | minor: '0' 35 | dep_digests: 36 | - exact_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 37 | functional_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 38 | nominal_hash: fae27d617aaf3e7d982e787694bfe8aa6184450716890f3529f5b9dbbe8a6f18 39 | provenance: null 40 | descriptor: 41 | exact_hash: 68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633 42 | functional_hash: df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae 43 | nominal_hash: 928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f 44 | descriptor: lowercase_sum 45 | exact_hash: 68023f00b2b8d8baf1e747165c0432eabc0148ce5f801b3da9103ede202da633 46 | functional_hash: df26876dd4463a18ff0c4fe5ed4088f6642b919fa5690e7c36314dab74b6aeae 47 | nominal_hash: 928386e200f120009b0fba16b2f0de0c22974433d0e30690957ef441b254b74f 48 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/uppercase_sum/4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3/2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91/metadata_dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 03cd8df2dd4256cd04907f6b281b2116f9188b3f32cd99d7e6d98f81b9a0e675 3 | url: ../../../../artifacts/uppercase_sum/5998ef92-4102-4e9c-9ef5-f996da3a9fd9/uppercase_sum.json 4 | descriptor: uppercase_sum 5 | 
provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - uppercase_chars 9 | - e99019711a 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5 20 | functional_hash: 4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3 21 | nominal_hash: 2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - uppercase_chars 26 | - e99019711a 27 | code_fingerprint: 28 | bytecode_hash: b'\xc0j\x18\x8f\xeef\xff\x0e\x89%\xa8z#\x16\x8d\xc9' 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: '0' 34 | minor: '0' 35 | dep_digests: 36 | - exact_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 37 | functional_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 38 | nominal_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 39 | provenance: null 40 | descriptor: 41 | exact_hash: dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5 42 | functional_hash: 4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3 43 | nominal_hash: 2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91 44 | descriptor: uppercase_sum 45 | exact_hash: dcc2a4fa9aaf6a7e06f5761c52c3ff9f00772e3c169286c31530badd602a4ed5 46 | functional_hash: 4eebae0400c86e94dbe61d3669149a8818be6c07a985c723330b97b87547e7b3 47 | nominal_hash: 2131eba21ea8c21c6e4830c8be043b71ee21edd9caa43657e5647fe5e72feb91 48 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/uppercase_sum/77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8/25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542/metadata_154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: cca69c8993ccba521327129f3f6d91ff72b7b1fb625ce355541dc349be74668b 3 | url: ../../../../artifacts/uppercase_sum/fbdc03e4-c713-4a7f-aca6-79bd31bb9d62/uppercase_sum.json 4 | descriptor: uppercase_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - uppercase_chars 9 | - e99019711a 10 | code_fingerprint: 11 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 12 | is_identity: true 13 | orig_flow_name: null 14 | version: 15 | includes_bytecode: true 16 | major: '0' 17 | minor: '0' 18 | dep_digests: 19 | - exact_hash: 154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035 20 | functional_hash: 77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8 21 | nominal_hash: 25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542 22 | provenance: 23 | case_key_elements: 24 | - !!python/tuple 25 | - uppercase_chars 26 | - e99019711a 27 | code_fingerprint: 28 | bytecode_hash: b'\xc0j\x18\x8f\xeef\xff\x0e\x89%\xa8z#\x16\x8d\xc9' 29 | is_identity: false 30 | orig_flow_name: null 31 | version: 32 | includes_bytecode: true 33 | major: '0' 34 | minor: '0' 35 | dep_digests: 36 | - exact_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 37 | functional_hash: 95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 38 | nominal_hash: 
95f21ce003ce31aba89385b8f7bd69e443fa1376763b8f9168313d778950e2ec 39 | provenance: null 40 | descriptor: 41 | exact_hash: 154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035 42 | functional_hash: 77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8 43 | nominal_hash: 25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542 44 | descriptor: uppercase_sum 45 | exact_hash: 154a158abcc7649a0948905714394ab5816346d05843df523261458828e60035 46 | functional_hash: 77f703a7588b04005d8ba8db4cf58b8accd9b951a7c301e0bd7a844315aab6b8 47 | nominal_hash: 25476be4b0032b37f58d1721d86043060b4c8647fe9fdeefc8dd30636231f542 48 | -------------------------------------------------------------------------------- /bionic/utils/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with URLs. 3 | """ 4 | 5 | import os 6 | from pathlib import Path 7 | from urllib.parse import unquote, urlparse 8 | 9 | FILE_SCHEME = "file" 10 | GCS_SCHEME = "gs" 11 | SUPPORTED_SCHEMES = [FILE_SCHEME, GCS_SCHEME] 12 | 13 | 14 | def is_file_url(url): 15 | result = urlparse(url) 16 | return result.scheme == FILE_SCHEME 17 | 18 | 19 | def is_gcs_url(url): 20 | result = urlparse(url) 21 | return result.scheme == GCS_SCHEME 22 | 23 | 24 | def is_absolute_url(url): 25 | result = urlparse(url) 26 | if not result.scheme: 27 | return False 28 | if result.scheme not in SUPPORTED_SCHEMES: 29 | raise ValueError(f"Found a URL with unsupported scheme {result.scheme!r}.") 30 | return True 31 | 32 | 33 | def path_from_url(url): 34 | result = urlparse(url) 35 | return Path(unquote(result.path)) 36 | 37 | 38 | def url_from_path(path): 39 | return Path(path).as_uri() 40 | 41 | 42 | def bucket_and_object_names_from_gs_url(url): 43 | if not is_gcs_url(url): 44 | raise ValueError(f'url must have schema "{GCS_SCHEME}": got {url}') 45 | result = urlparse(url) 46 | result_path = result.path 47 | return result.netloc, result_path[1:] 48 | 49 | 50 | def relativize_url(absolute_url, base_url): 51 | """ 52 | Converts an absolute file URL to one relative to a base file URL. 53 | 54 | If either URL is not a file URL, this returns the original absolute URL. 55 | """ 56 | 57 | if not is_file_url(absolute_url) or not is_file_url(base_url): 58 | return absolute_url 59 | absolute_path = path_from_url(absolute_url) 60 | base_path = path_from_url(base_url) 61 | # Using str(absolute_path.relative_to(base_path.parent)) doesn't work as well here, 62 | # because it throws an exception if base_path is not a parent of absolute_path. 63 | return os.path.relpath(absolute_path, base_path.parent) 64 | 65 | 66 | def derelativize_url(relative_url, base_url): 67 | """ 68 | Given a URL relative to another base URL, returns an absolute URL. 69 | 70 | If the first URL is not relative, it is returned unchanged. 71 | """ 72 | 73 | if is_absolute_url(relative_url): 74 | return relative_url 75 | base_path = path_from_url(base_url) 76 | relative_path = path_from_url(relative_url) 77 | absolute_path = os.path.normpath(base_path.parent.joinpath(relative_path)) 78 | return url_from_path(absolute_path) 79 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | Bionic 3 | ====== 4 | 5 | .. note:: 6 | Bionic is in alpha and evolving rapidly. We recommend it for research 7 | projects where the dataset fits in memory. 
We do not recommend it for 8 | pipelines running in production. 9 | 10 | **Release:** v\ |version| --- 11 | **Quick Links:** `Source `_ | `Issues `_ | `Installation `_ | `Example `_ 12 | 13 | Bionic is a framework for analyzing and modeling data in Python. It's designed 14 | to help you **iterate faster on your research**, and help your colleagues 15 | **reuse your code more easily**. 16 | 17 | You define the *entities* you care about -- dataframes, parameters, models, 18 | plots -- using individual Python functions. Then Bionic assembles your 19 | definitions into a *flow*: a custom Python object that can efficiently compute 20 | any of your entities, and can be modified on the fly to test out new 21 | variations. 22 | 23 | This approach has several benefits: 24 | 25 | * Bionic automatically glues your functions into a coherent program, so your 26 | **code stays modular** but behaves like an **integrated end-to-end tool**. 27 | * You can compute any entity with one function call, so it's **easy to iterate 28 | and debug**. 29 | * Everything you compute is automatically cached, so you spend **less time 30 | waiting** and **zero time managing data files**. 31 | * Flows are easy to use from a notebook, so you can **work interactively** but 32 | keep your code in a **version-controlled** Python file. 33 | * Any part of a flow can be modified dynamically, so you can **quickly try 34 | experiments**, and your colleagues can **reuse your code** without rewriting 35 | it. 36 | 37 | .. 38 | This is super annoying, but it's the only way I've found to make a bold 39 | internal link in RST. (I really want the link to be bold so you can see 40 | the example link easily when you're scanning.) 41 | 42 | Check out an |bold link|! 43 | 44 | .. |bold link| raw:: html 45 | 46 | 47 | example here 48 | 49 | Documentation Contents 50 | ---------------------- 51 | 52 | .. toctree:: 53 | :maxdepth: 2 54 | 55 | what 56 | get-started 57 | concepts 58 | warnings 59 | api/index.rst 60 | get-help 61 | contributing 62 | future 63 | release-notes 64 | -------------------------------------------------------------------------------- /docs/api/decorators.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Decorators 3 | ========== 4 | 5 | Introduction 6 | ------------ 7 | 8 | Bionic decorators are Python decorators designed to be used in conjunction with 9 | a ``FlowBuilder``. They modify the way functions are incorporated into flows. 10 | 11 | The normal way (without decorators) of incorporating functions into flows is 12 | as follows: 13 | 14 | .. code-block:: python 15 | 16 | import bionic as bn 17 | 18 | builder = FlowBuilder('my_flow') 19 | 20 | builder.assign('x', 1) 21 | 22 | @builder 23 | def x_plus_one(x): 24 | return x + 1 25 | 26 | print(builder.build().get('x_plus_one')) # Prints "2". 27 | 28 | In the simple case above, the function is interpreted as a new entity named 29 | ``x_plus_one`` which depends on the existing entity ``x``. However, in many 30 | cases we want Bionic to process the function in a more complex way. In these 31 | cases we can add additional decorators: 32 | 33 | .. code-block:: python 34 | 35 | import bionic as bn 36 | 37 | builder = FlowBuilder('my_flow') 38 | 39 | builder.assign('x', 1) 40 | 41 | @builder 42 | @bn.outputs('x_plus_one', 'x_plus_two') 43 | @bn.persist(False) 44 | def some_function(x): 45 | return (x + 1), (x + 2) 46 | 47 | print(builder.build().get('x_plus_one')) # Prints "2". 
48 | print(builder.build().get('x_plus_two')) # Prints "3". 49 | 50 | These decorators tell Bionic that our function actually generates two values 51 | for two different entities (``x_plus_one`` and ``x_plus_two``), and these 52 | values should not be persisted to disk. 53 | 54 | All Bionic decorators should be placed *after* the initial ``@builder`` 55 | decorator, but *before* any regular (non-Bionic) decorators. Finally, the 56 | ``@builder`` decorator returns the original function, so it can be called 57 | normally, as if it had been defined without any of the Bionic decorators. 58 | E.g.: 59 | 60 | .. code-block:: python 61 | 62 | @builder 63 | @bn.persist(False) 64 | def f(x): 65 | return x + 1 66 | 67 | assert f(7) == 8 68 | 69 | Built-In Decorators 70 | ------------------- 71 | 72 | .. autofunction:: bionic.run_in_aip 73 | .. autofunction:: bionic.changes_per_run 74 | .. autofunction:: bionic.docs 75 | .. autofunction:: bionic.gather 76 | .. autofunction:: bionic.immediate 77 | .. autofunction:: bionic.memoize 78 | .. autofunction:: bionic.output 79 | .. autofunction:: bionic.outputs 80 | .. autofunction:: bionic.persist 81 | .. autofunction:: bionic.pyplot 82 | .. autofunction:: bionic.version 83 | 84 | -------------------------------------------------------------------------------- /tests/test_flow/test_multi_out.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import bionic as bn 4 | 5 | 6 | def test_no_doc(builder): 7 | @builder 8 | @bn.outputs("a", "b") 9 | def f(): 10 | return 1, 2 11 | 12 | flow = builder.build() 13 | assert flow.entity_doc("a") is None 14 | assert flow.entity_doc("b") is None 15 | 16 | 17 | def test_multi_docs(builder): 18 | @builder 19 | @bn.outputs("a", "b") 20 | @bn.docs("a doc", "b doc") 21 | def f(): 22 | return 1, 2 23 | 24 | flow = builder.build() 25 | assert flow.entity_doc("a") == "a doc" 26 | assert flow.entity_doc("b") == "b doc" 27 | 28 | 29 | def test_multi_docs_decorated_first(builder): 30 | @builder 31 | @bn.docs("a doc", "b doc") 32 | @bn.outputs("a", "b") 33 | def f(): 34 | return 1, 2 35 | 36 | flow = builder.build() 37 | assert flow.entity_doc("a") == "a doc" 38 | assert flow.entity_doc("b") == "b doc" 39 | 40 | 41 | def test_too_many_docs(builder): 42 | with pytest.raises(ValueError): 43 | 44 | @builder 45 | @bn.docs("a doc", "b doc") 46 | def f(): 47 | return 1, 2 48 | 49 | 50 | def test_too_few_docs(builder): 51 | with pytest.warns(Warning): 52 | 53 | @builder 54 | @bn.outputs("a", "b") 55 | def f(): 56 | "a and b doc" 57 | return 1, 2 58 | 59 | flow = builder.build() 60 | assert flow.entity_doc("a") == "a and b doc" 61 | assert flow.entity_doc("b") == "a and b doc" 62 | 63 | 64 | def test_multi_default_protocols(builder): 65 | @builder 66 | @bn.outputs("a", "b") 67 | def f(): 68 | return 1, 2 69 | 70 | flow = builder.build() 71 | assert flow.entity_protocol("a") == bn.flow.DEFAULT_PROTOCOL 72 | assert flow.entity_protocol("b") == bn.flow.DEFAULT_PROTOCOL 73 | 74 | 75 | def test_multi_custom_protocols(builder): 76 | protocol = bn.protocol.dillable() 77 | 78 | @builder 79 | @bn.outputs("a", "b") 80 | @protocol 81 | def f(): 82 | return 1, 2 83 | 84 | flow = builder.build() 85 | assert flow.entity_protocol("a") == protocol 86 | assert flow.entity_protocol("b") == protocol 87 | 88 | 89 | def test_multi_custom_protocols_decorated_first(builder): 90 | protocol = bn.protocol.dillable() 91 | 92 | @builder 93 | @protocol 94 | @bn.outputs("a", "b") 95 | def f(): 96 | return 1, 2 97 | 
98 | flow = builder.build() 99 | assert flow.entity_protocol("a") == protocol 100 | assert flow.entity_protocol("b") == protocol 101 | -------------------------------------------------------------------------------- /bionic/tokenization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains a tokenize() function which can be used to convert arbitrary values 3 | into nice strings, suitable for use as filenames. 4 | """ 5 | 6 | from .utils.misc import hash_to_hex 7 | 8 | 9 | def char_range(first, last): 10 | "Return a list of all the characters from first to last, inclusive." 11 | return [chr(i) for i in range(ord(first), ord(last) + 1)] 12 | 13 | 14 | CLEAN_CHARS = set( 15 | char_range("a", "z") + char_range("A", "Z") + char_range("0", "9") + ["_", "-", "."] 16 | ) 17 | MAX_CLEAN_STR_LEN = 32 18 | 19 | 20 | def clean_str(string): 21 | "Converts an arbitary string to one that could be used as a filename." 22 | cleaned = "".join((c if c in CLEAN_CHARS else ".") for c in string) 23 | # Some filesystems are case insensitive, so we don't want uppercase 24 | # letters. 25 | cleaned = cleaned.lower() 26 | # Some filesystems treat files differently if they start with a period, so 27 | # let's avoid that. 28 | if cleaned.startswith("."): 29 | cleaned = "_" + cleaned 30 | if len(cleaned) > MAX_CLEAN_STR_LEN: 31 | head_len = (MAX_CLEAN_STR_LEN // 2) - 1 32 | tail_len = MAX_CLEAN_STR_LEN - (head_len + 3) 33 | cleaned = cleaned[:head_len] + "..." + cleaned[-tail_len:] 34 | return cleaned 35 | 36 | 37 | # When hashing values for tokens, we'll hash down to 5 bytes (10 hex chars). 38 | # The reasoning is: 39 | # - we want to support up to 1e6 distinct values 40 | # - to avoid collisions, we need a hash space of 1e6 squared, or 1e12 41 | # - that's 36 bits 42 | # - rounding up, that's 5 bytes 43 | # I picked 1e6 arbitrarily; the hash is only used when two values have the same 44 | # "clean string" value OR when they can't be converted to strings at all, but 45 | # that will include things like dicts of hyperparameter values. 46 | HASH_LEN = 5 47 | 48 | 49 | # TODO: add optional directory parameter for where to write/read from 50 | def tokenize(value, serialize_func=None): 51 | """ 52 | Convert an arbitrary value to a nice, unique string that could be used as a 53 | filename. If a serialization function is provided, the value will be 54 | serialized and hashed. Otherwise it will be converted to a string; if that 55 | string is not suitable for a filename, it will be cleaned and a hash will 56 | be appended. 57 | """ 58 | 59 | if serialize_func is not None: 60 | bytestring = serialize_func(value) 61 | token = hash_to_hex(bytestring, HASH_LEN) 62 | else: 63 | value_str = str(value) 64 | token = clean_str(value_str) 65 | if token != value_str: 66 | token += "_" + hash_to_hex(value_str.encode("utf-8"), HASH_LEN) 67 | 68 | return token 69 | -------------------------------------------------------------------------------- /bionic/gcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with Google Cloud Storage. 
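The two helpers defined here are ``get_gcs_fs_without_warnings``, which returns a
(cached) fsspec GCS filesystem, and ``upload_to_gcs``, which copies a local path to
a GCS URL. A rough usage sketch (``gs://my-bucket/results.json`` is a hypothetical
URL):

    fs = get_gcs_fs_without_warnings()
    with fs.open("gs://my-bucket/results.json", "rb") as f:
        data = f.read()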
3 | """ 4 | 5 | import logging 6 | import warnings 7 | 8 | from .deps.optdep import import_optional_dependency 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | _cached_gcs_fs = None 14 | 15 | 16 | def get_gcs_fs_without_warnings(cache_value=True): 17 | # TODO It's not expensive to create the gcs filesystem, but caching this enables 18 | # us to mock the cached gcs_fs with a mock implementation in tests. We should 19 | # change the tests to inject the filesystem in a different way and get rid of 20 | # this caching. 21 | if cache_value: 22 | global _cached_gcs_fs 23 | if _cached_gcs_fs is None: 24 | _cached_gcs_fs = get_gcs_fs_without_warnings(cache_value=False) 25 | return _cached_gcs_fs 26 | 27 | fsspec = import_optional_dependency("fsspec", purpose="caching to GCS") 28 | 29 | with warnings.catch_warnings(): 30 | # Google's SDK warns if you use end user credentials instead of a 31 | # service account. I think this warning is intended for production 32 | # server code, where you don't want GCP access to be tied to a 33 | # particular user. However, this code is intended to be run by 34 | # individuals, so using end user credentials seems appropriate. 35 | # Hence, we'll suppress this warning. 36 | warnings.filterwarnings( 37 | "ignore", "Your application has authenticated using end user credentials" 38 | ) 39 | logger.info("Initializing GCS filesystem ...") 40 | return fsspec.filesystem("gcs") 41 | 42 | 43 | # TODO: Consider using persistence.GcsFilesystem instead of exposing this function. 44 | def upload_to_gcs(path, url): 45 | """ 46 | Copy a local path to GCS URL. 47 | """ 48 | gcs_fs = get_gcs_fs_without_warnings() 49 | if path.is_dir(): 50 | gcs_fs.put(str(path), url, recursive=True) 51 | else: 52 | # If the GCS URL is a folder, we want to write the file in the folder. 53 | # There seems to be a bug in fsspec due to which, the file is uploaded 54 | # as the url, instead of inside the folder. What this means is, writing 55 | # a file c.json to gs://a/b/ would result in file gs://a/b instead of 56 | # gs://a/b/c.json. 57 | # 58 | # The `put` API is supposed to write the file inside the folder but it 59 | # strips the ending "/" at the end in fsspec's `_strip_protocol` method. 60 | # See https://github.com/intake/filesystem_spec/issues/448 for more 61 | # details and tracking this issue. 
62 | if url.endswith("/"): 63 | url = url + path.name 64 | gcs_fs.put_file(str(path), url) 65 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/total_sum/401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0/f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b/metadata_3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 927b8935346c55e24d2cf6de1a771cf018fb38df4bd182acad078557b88a773d 3 | url: ../../../../artifacts/total_sum/faed6d2b-5b8c-449a-9fd6-946bec4f5b0d/total_sum.json 4 | descriptor: total_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | - !!python/tuple 11 | - uppercase_chars 12 | - e99019711a 13 | code_fingerprint: 14 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 15 | is_identity: true 16 | orig_flow_name: null 17 | version: 18 | includes_bytecode: true 19 | major: '0' 20 | minor: '0' 21 | dep_digests: 22 | - exact_hash: 3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34 23 | functional_hash: 401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0 24 | nominal_hash: f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b 25 | provenance: 26 | case_key_elements: 27 | - !!python/tuple 28 | - lowercase_chars 29 | - 9ed0cd8e69 30 | - !!python/tuple 31 | - uppercase_chars 32 | - e99019711a 33 | code_fingerprint: 34 | bytecode_hash: b'\xf9\x8a\xd9\xda\xf2\xc3Hb<\xa5\xde\xc2g\x04xA' 35 | is_identity: false 36 | orig_flow_name: null 37 | version: 38 | includes_bytecode: true 39 | major: '0' 40 | minor: '0' 41 | dep_digests: 42 | - exact_hash: 53f0c7cbf9464530929bbdb5991d003f15e53687898ab690af83ee8ea90c1533 43 | functional_hash: 53f0c7cbf9464530929bbdb5991d003f15e53687898ab690af83ee8ea90c1533 44 | nominal_hash: 53f0c7cbf9464530929bbdb5991d003f15e53687898ab690af83ee8ea90c1533 45 | provenance: null 46 | - exact_hash: cca69c8993ccba521327129f3f6d91ff72b7b1fb625ce355541dc349be74668b 47 | functional_hash: cca69c8993ccba521327129f3f6d91ff72b7b1fb625ce355541dc349be74668b 48 | nominal_hash: cca69c8993ccba521327129f3f6d91ff72b7b1fb625ce355541dc349be74668b 49 | provenance: null 50 | descriptor: 51 | exact_hash: 3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34 52 | functional_hash: 401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0 53 | nominal_hash: f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b 54 | descriptor: total_sum 55 | exact_hash: 3e3b9ef2b6a3946f569202f99045b3b61d7b1a8e327566282ec558dd0254af34 56 | functional_hash: 401bf02deffc7c8d58ba69ed3187ed5b47c8b69fe92da389d7161a78581ce1d0 57 | nominal_hash: f723892d217a64ac4124c1e92c00d73b8d8986cf17f50a4de70ad5224e54d17b 58 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_compatibility/inventory/total_sum/e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98/e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c/metadata_354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d.yaml: -------------------------------------------------------------------------------- 1 | artifact: 2 | content_hash: 6c7cfc2deb288fa30f5036fa442d8082d755c0bbcbb8f93f3c49c957142114fb 3 | url: ../../../../artifacts/total_sum/2f000e88-5a8f-4762-b7c4-77eb444348f6/total_sum.json 4 | 
descriptor: total_sum 5 | provenance: 6 | case_key_elements: 7 | - !!python/tuple 8 | - lowercase_chars 9 | - 9ed0cd8e69 10 | - !!python/tuple 11 | - uppercase_chars 12 | - e99019711a 13 | code_fingerprint: 14 | bytecode_hash: b'`\xad\xae\x8e\xb8b5\x96%\x14\x04\xee\r\xc9w3' 15 | is_identity: true 16 | orig_flow_name: null 17 | version: 18 | includes_bytecode: true 19 | major: '0' 20 | minor: '0' 21 | dep_digests: 22 | - exact_hash: 354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d 23 | functional_hash: e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98 24 | nominal_hash: e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c 25 | provenance: 26 | case_key_elements: 27 | - !!python/tuple 28 | - lowercase_chars 29 | - 9ed0cd8e69 30 | - !!python/tuple 31 | - uppercase_chars 32 | - e99019711a 33 | code_fingerprint: 34 | bytecode_hash: b'\xf9\x8a\xd9\xda\xf2\xc3Hb<\xa5\xde\xc2g\x04xA' 35 | is_identity: false 36 | orig_flow_name: null 37 | version: 38 | includes_bytecode: true 39 | major: '0' 40 | minor: '0' 41 | dep_digests: 42 | - exact_hash: a3e8bae3649bc57d44b928616c9a641643e36bd26e13467f24dffe7aba3eaff2 43 | functional_hash: a3e8bae3649bc57d44b928616c9a641643e36bd26e13467f24dffe7aba3eaff2 44 | nominal_hash: a3e8bae3649bc57d44b928616c9a641643e36bd26e13467f24dffe7aba3eaff2 45 | provenance: null 46 | - exact_hash: 03cd8df2dd4256cd04907f6b281b2116f9188b3f32cd99d7e6d98f81b9a0e675 47 | functional_hash: 03cd8df2dd4256cd04907f6b281b2116f9188b3f32cd99d7e6d98f81b9a0e675 48 | nominal_hash: 03cd8df2dd4256cd04907f6b281b2116f9188b3f32cd99d7e6d98f81b9a0e675 49 | provenance: null 50 | descriptor: 51 | exact_hash: 354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d 52 | functional_hash: e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98 53 | nominal_hash: e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c 54 | descriptor: total_sum 55 | exact_hash: 354167e0f9a7bdc5d7170980d1c0e278e2875bde04db5ac6ccd7bd7f1a73bd6d 56 | functional_hash: e06c59dc8c0982d1a495ac7525ed5f3b6cc09fcfe9b82b905f199adbaebc7d98 57 | nominal_hash: e34bb2007d114a11e1fd278c00dbf9a244d935eeb2e4a788b50d542152ae655c 58 | -------------------------------------------------------------------------------- /.github/workflows/bionic-test.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | workflow_dispatch: 7 | 8 | jobs: 9 | build: 10 | 11 | # TODO Consider running on macos-latest as well. 12 | runs-on: ubuntu-latest 13 | # Just in case we start having to pay for our CI compute costs, it's probably wise 14 | # to have a time limit. 15 | timeout-minutes: 60 16 | strategy: 17 | # Keep running all test configurations, even if one of them fails. This is helpful 18 | # because if one configuration fails, it's useful to see whether the other ones 19 | # fail too. (This helps diagnose tests that are flaky or specific to one Python 20 | # version.) 
21 | fail-fast: false 22 | matrix: 23 | python-version: [3.7, 3.8] 24 | include: 25 | - python-version: 3.7 26 | shard-id: 1 27 | - python-version: 3.8 28 | shard-id: 2 29 | 30 | steps: 31 | - uses: actions/checkout@v2 32 | - name: Set up Python ${{ matrix.python-version }} 33 | uses: actions/setup-python@v2 34 | with: 35 | python-version: ${{ matrix.python-version }} 36 | - name: Install dependencies 37 | run: | 38 | python -m pip install --upgrade pip 39 | sudo apt-get install graphviz 40 | pip install --upgrade --upgrade-strategy eager '.[dev]' 41 | # This prints out all installed package versions, which may help for debugging 42 | # build failures. 43 | pip freeze 44 | - name: Set up gcloud 45 | uses: google-github-actions/setup-gcloud@v0.2.1 46 | with: 47 | service_account_key: ${{ secrets.GCP_SA_KEY }} 48 | export_default_credentials: true 49 | - name: Set up GCS bucket argument 50 | # If we have access to a GCS bucket, we want to run our tests with it. 51 | # But if we were triggered from a pull request (as opposed to a push) then 52 | # we won't have access to any secrets, in which case we need to omit the 53 | # `--bucket` argument. 54 | # Unfortunately this seems to be the simplest way to make this work. See 55 | # https://github.community/t/how-can-i-test-if-secrets-are-available-in-an-action/17911 56 | # for more details. 57 | run: | 58 | ([ -z ${{ secrets.GCP_BUCKET }} ] || echo "BUCKET_ARG=--bucket=${{ secrets.GCP_BUCKET }}" >> $GITHUB_ENV) 59 | - name: Lint with flake8 and black 60 | run: | 61 | flake8 62 | black --check . 63 | - name: Run baseline tests 64 | # Running GCS tests in CI costs less than a dollar per day on average. 65 | run: | 66 | pytest $BUCKET_ARG 67 | - name: Run extra tests (sharded) 68 | # Running each test on each Python version is expensive, so we compromise: we run 69 | # the baseline tests above on each version, since they're fast and hopefully 70 | # comprehensive enough to shake out any version-specific bugs; and we run each of 71 | # the other tests on just one Python version, reducing the total build time. 72 | run: | 73 | pytest $BUCKET_ARG --parallel --slow -m 'not baseline' --num-shards 3 --shard-id ${{matrix.shard-id}} 74 | -------------------------------------------------------------------------------- /bionic/decoration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for creating and applying Bionic decorators. 3 | 4 | Bionic decorators are expected to be applied like this: 5 | 6 | @builder 7 | @decorator1 8 | @decorator2 9 | def func(arg1, arg2, ...): 10 | ... 11 | 12 | Each decorator attaches information to the decorated function by by creating or updating 13 | a DecorationAccumulator object, set as an attribute on the function. The assumption is 14 | that the decorator at the top, (``@builder``) will be a FlowBuilder object which removes 15 | this accumulator object and uses it to define a new entity. 
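As a rough illustration (not an actual Bionic decorator), a new decorator could be
built with the ``decorator_updating_accumulator`` helper defined below. For example,
a hypothetical decorator that disables persistence might look like:

    never_persist = decorator_updating_accumulator(
        lambda acc: acc.update_attr("should_persist", False, "@never_persist")
    )

Applying ``@never_persist`` leaves the decorated function unchanged and only records
the setting on the attached DecorationAccumulator.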
16 | """ 17 | 18 | import attr 19 | import warnings 20 | 21 | from .exception import AttributeValidationError 22 | from .provider import FunctionProvider 23 | from .utils.misc import oneline 24 | 25 | 26 | @attr.s 27 | class DecorationAccumulator: 28 | provider = attr.ib() 29 | 30 | protocol = attr.ib(default=None) 31 | docs = attr.ib(default=None) 32 | should_persist = attr.ib(default=None) 33 | should_memoize = attr.ib(default=None) 34 | 35 | def wrap_provider(self, wrapper_fn, *args, **kwargs): 36 | self.provider = wrapper_fn(self.provider, *args, **kwargs) 37 | 38 | def update_attr( 39 | self, attr_name, attr_value, decorator_name, raise_if_already_set=True 40 | ): 41 | old_attr_value = getattr(self, attr_name) 42 | if old_attr_value is not None: 43 | message = f""" 44 | Tried to use {decorator_name} with value {attr_value!r}, 45 | but this decorator was already used with value {old_attr_value!r} 46 | """ 47 | if raise_if_already_set: 48 | raise AttributeValidationError(oneline(message)) 49 | else: 50 | preamble = """ 51 | Applying this type of decorator multiple times is deprecated and will 52 | become an error condition in a future release; please remove all but 53 | the uppermost uses of this decorator. Details: 54 | """ 55 | warnings.warn(oneline(preamble) + "\n" + oneline(message)) 56 | setattr(self, attr_name, attr_value) 57 | 58 | 59 | def decorator_updating_accumulator(acc_update_func): 60 | """ 61 | Creates a decorator which applies a transformation to the DecorationAccumulator 62 | attached to the decorated function. (If no accumulator is attached, the decorator 63 | will initialize one.) 64 | """ 65 | 66 | def decorator(func): 67 | init_accumulator_if_not_set_on_func(func) 68 | acc = get_accumulator_from_func(func) 69 | acc_update_func(acc) 70 | return func 71 | 72 | return decorator 73 | 74 | 75 | ACC_ATTR_NAME = "bionic_decorator_accumulator" 76 | 77 | 78 | def init_accumulator_if_not_set_on_func(func): 79 | if not hasattr(func, ACC_ATTR_NAME): 80 | setattr( 81 | func, 82 | ACC_ATTR_NAME, 83 | DecorationAccumulator(provider=FunctionProvider(func)), 84 | ) 85 | 86 | 87 | def get_accumulator_from_func(func): 88 | return getattr(func, ACC_ATTR_NAME) 89 | 90 | 91 | def pop_accumulator_from_func(func): 92 | acc = get_accumulator_from_func(func) 93 | delattr(func, ACC_ATTR_NAME) 94 | return acc 95 | -------------------------------------------------------------------------------- /tests/test_flow/generate_test_compatibility_cache.py: -------------------------------------------------------------------------------- 1 | # This script generates cache for a flow represented in Harness class 2 | # inside test_dir (tests/test_flow/test_persistence_compatibility). 3 | # The generated cache is used by test_persistence_compatibility.py tests 4 | # to validate that the cache can be deserialized by current Bionic. 5 | # In case the caching has changed, this file is used to replace the 6 | # test cache. 7 | # 8 | # To renegerate cache, run the following command from bionic/ dir 9 | # `python -m tests.test_flow.generate_test_compatibility_cache` 10 | # 11 | # Note that the repo ignores *.pkl datafiles which is bypassed using 12 | # "Test data" section in .gitignore. 
13 | 14 | import os 15 | import shutil 16 | 17 | import bionic as bn 18 | 19 | from ..helpers import ResettingCallCounter 20 | 21 | 22 | CACHE_TEST_DIR = os.path.join( 23 | os.path.dirname(__file__), "test_persistence_compatibility" 24 | ) 25 | 26 | 27 | class Harness: 28 | """ 29 | Holds a simple Bionic flow with counters to all the functions in it. 30 | """ 31 | 32 | EXPECTED_TOTAL_SUM = 1002 33 | 34 | def __init__(self, cache_dir, make_counter): 35 | lowercase_sum_counter = make_counter() 36 | uppercase_sum_counter = make_counter() 37 | total_sum_counter = make_counter() 38 | 39 | builder = bn.FlowBuilder("test") 40 | 41 | builder.set("core__persistent_cache__flow_dir", cache_dir) 42 | 43 | # It's important that this test uses sets, because we want to check that sets 44 | # are hashed deterministically. (Set iteration is non-deterministic, but it's 45 | # always the same within one Python process, so a simpler test where we just 46 | # run a flow multiple times won't work for this.) 47 | builder.assign("lowercase_chars", set("abcdef")) 48 | builder.assign("uppercase_chars", frozenset("ABCDEF")) 49 | 50 | @builder 51 | @bn.version_no_warnings 52 | def lowercase_sum(lowercase_chars): 53 | lowercase_sum_counter.mark() 54 | return sum(ord(char) for char in lowercase_chars) 55 | 56 | @builder 57 | @bn.version_no_warnings 58 | def uppercase_sum(uppercase_chars): 59 | uppercase_sum_counter.mark() 60 | return sum(ord(char) for char in uppercase_chars) 61 | 62 | @builder 63 | @bn.version_no_warnings 64 | def total_sum(lowercase_sum, uppercase_sum): 65 | total_sum_counter.mark() 66 | return lowercase_sum + uppercase_sum 67 | 68 | self.lowercase_sum_counter = lowercase_sum_counter 69 | self.uppercase_sum_counter = uppercase_sum_counter 70 | self.total_sum_counter = total_sum_counter 71 | 72 | self.manual_flow = builder.build() 73 | builder.set("core__versioning_mode", "auto") 74 | self.auto_flow = builder.build() 75 | 76 | @property 77 | def flows(self): 78 | return [self.manual_flow, self.auto_flow] 79 | 80 | 81 | if __name__ == "__main__": 82 | 83 | def make_counter(): 84 | return ResettingCallCounter() 85 | 86 | harness = Harness(CACHE_TEST_DIR, make_counter) 87 | 88 | shutil.rmtree(CACHE_TEST_DIR) 89 | 90 | for flow in harness.flows: 91 | # Make sure everything is written to the cache. 92 | flow.get("total_sum") 93 | -------------------------------------------------------------------------------- /tests/test_flow/test_new_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | These tests are for experimental descriptor-based uses of Bionic's API. 
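For example (mirroring the cases exercised below), ``@bn.returns`` maps a tuple
return value onto multiple entities, and ``@bn.accepts`` renames function arguments
to existing entities:

    @builder
    @bn.returns("three, four")
    def _():
        return 3, 4

    @builder
    @bn.accepts(my_x="x")
    def x_plus_one(my_x):
        return my_x + 1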
3 | """ 4 | 5 | import pytest 6 | 7 | import bionic as bn 8 | from bionic.exception import EntityValueError 9 | 10 | 11 | def test_returns(builder): 12 | @builder 13 | @bn.returns("one") 14 | def _(): 15 | return 1 16 | 17 | @builder 18 | @bn.returns("two,") 19 | def _(): 20 | return (2,) 21 | 22 | @builder 23 | @bn.returns("three, four") 24 | def _(): 25 | return 3, 4 26 | 27 | @builder 28 | @bn.returns("five, (six, seven)") 29 | def _(): 30 | return 5, (6, 7) 31 | 32 | flow = builder.build() 33 | 34 | assert flow.get("one") == 1 35 | assert flow.get("two") == 2 36 | assert flow.get("three") == 3 37 | assert flow.get("four") == 4 38 | assert flow.get("five") == 5 39 | assert flow.get("six") == 6 40 | assert flow.get("seven") == 7 41 | 42 | 43 | def test_failing_returns(builder): 44 | @builder 45 | @bn.returns("a, b") 46 | def wrong_number_of_values(): 47 | return 1, 2, 3 48 | 49 | @builder 50 | @bn.returns("c, d") 51 | def not_a_sequence(): 52 | return 1 53 | 54 | @builder 55 | @bn.returns("(e, f), g") 56 | def wrong_tuple_structure(): 57 | return 1, (2, 3) 58 | 59 | flow = builder.build() 60 | 61 | with pytest.raises(EntityValueError): 62 | flow.get("a") 63 | 64 | with pytest.raises(EntityValueError): 65 | flow.get("c") 66 | 67 | with pytest.raises(EntityValueError): 68 | flow.get("e") 69 | 70 | 71 | def test_accepts(builder): 72 | builder.assign("x", 2) 73 | builder.assign("y", 3) 74 | builder.assign("z", 4) 75 | 76 | @builder 77 | @bn.accepts(my_x="x") 78 | def x_plus_one(my_x): 79 | return my_x + 1 80 | 81 | @builder 82 | @bn.accepts(x_="x,") 83 | def x_plus_two(x_): 84 | (x,) = x_ 85 | return x + 2 86 | 87 | @builder 88 | @bn.accepts(my_y="y", my_other_y="y") 89 | def x_plus_two_y(x, my_y, my_other_y): 90 | return x + my_y + my_other_y 91 | 92 | @builder 93 | @bn.accepts(x_y="x, y") 94 | def x_plus_y(x_y): 95 | x, y = x_y 96 | return x + y 97 | 98 | @builder 99 | @bn.accepts(my_x="x", my_y="y") 100 | def xy(my_x, my_y): 101 | return my_x * my_y 102 | 103 | @builder 104 | @bn.accepts(x_y_z="x, (y, z)") 105 | def x_plus_y_plus_z(x_y_z): 106 | x, (y, z) = x_y_z 107 | return x + y + z 108 | 109 | flow = builder.build() 110 | 111 | assert flow.get("x_plus_one") == 3 112 | assert flow.get("x_plus_two") == 4 113 | assert flow.get("x_plus_y") == 5 114 | assert flow.get("x_plus_two_y") == 8 115 | assert flow.get("xy") == 6 116 | assert flow.get("x_plus_y_plus_z") == 9 117 | 118 | 119 | @pytest.mark.skip("Not implemented yet") 120 | def test_get(builder): 121 | builder.assign("x", 2) 122 | builder.assign("y", 3) 123 | builder.assign("z", 4) 124 | 125 | flow = builder.build() 126 | 127 | assert flow.get("()") == () 128 | assert flow.get("x,") == (2,) 129 | assert flow.get("x, x") == (2, 2) 130 | assert flow.get("x, y") == (2, 3) 131 | assert flow.get("x, (y, z)") == (2, (3, 4)) 132 | -------------------------------------------------------------------------------- /bionic/aip/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module is run as main in order to execute a task on a worker. 3 | """ 4 | 5 | import logging 6 | import pickle 7 | import os 8 | import sys 9 | 10 | from bionic.deps.optdep import import_optional_dependency 11 | from bionic.gcs import get_gcs_fs_without_warnings 12 | 13 | 14 | def _run(ipath, gcs_fs): 15 | cloudpickle = import_optional_dependency("cloudpickle") 16 | 17 | with gcs_fs.open(ipath, "rb") as f: 18 | task = cloudpickle.load(f) 19 | 20 | # Now that we have the task, set up logging. 
21 | _set_up_logging(task.job_id, task.config.project_id) 22 | logging.info(f"Read task from {ipath}") 23 | 24 | result = task.function() 25 | 26 | opath = task.output_uri 27 | logging.info(f"Uploading result to {opath}") 28 | with gcs_fs.open(opath, "wb") as f: 29 | pickle.dump(result, f) 30 | 31 | 32 | # Main entry point for AIP 33 | def run(): 34 | """ 35 | This method is a proxy to _run which does the actual work. The proxy exists 36 | so that _run can be replaced for testing. 37 | """ 38 | _run(sys.argv[-1], get_gcs_fs_without_warnings()) 39 | 40 | 41 | def _set_up_logging(job_id, project_id): 42 | if os.environ.get("BIONIC_NO_STACKDRIVER", False): 43 | return 44 | 45 | # TODO This is the ID of the hyperparameter tuning trial currently 46 | # running on this VM. This field is only set if the current 47 | # training job is a hyperparameter tuning job. Conductor uses this 48 | # environment variable but AIP documentation suggests us to use 49 | # TF_CONFIG. Check whether we need to update this env variable. 50 | # Find more details on TF_CONFIG at this link: 51 | # https://cloud.google.com/ai-platform/training/docs/distributed-training-details 52 | trial_id = os.environ.get("CLOUD_ML_TRIAL_ID", None) 53 | 54 | glogging = import_optional_dependency("google.cloud.logging") 55 | 56 | client = glogging.Client(project=project_id) 57 | resource = glogging.resource.Resource( 58 | type="ml_job", 59 | # AIP expects a default task_name for the master cluster. We 60 | # use a placeholder value till we start using clusters. Once we 61 | # do, it should be configured based on the cluster. 62 | labels=dict(job_id=job_id, project_id=project_id, task_name="master-replica-0"), 63 | ) 64 | labels = None 65 | if trial_id is not None: 66 | # Enable grouping by trial when present. 67 | labels = {"ml.googleapis.com/trial_id": trial_id} 68 | 69 | # Enable only the cloud logger to avoid duplicate messages. 70 | handler = glogging.handlers.handlers.CloudLoggingHandler( 71 | client, resource=resource, labels=labels 72 | ) 73 | root_logger = logging.getLogger() 74 | # Remote the StreamHandler. Any logs logged by it shows up as error 75 | # logs in Stackdriver. 76 | root_logger.handlers = [] 77 | # We should ideally make this configurable, but till then, let's 78 | # set the level to DEBUG to write all the logs. It's not hard to 79 | # filter using log level on Stackdriver so it doesn't create too 80 | # much noise anyway. 81 | root_logger.setLevel(logging.DEBUG) 82 | root_logger.addHandler(handler) 83 | for logger_name in glogging.handlers.handlers.EXCLUDED_LOGGER_DEFAULTS: 84 | logging.getLogger(logger_name).propagate = False 85 | 86 | 87 | if __name__ == "__main__": 88 | run() 89 | -------------------------------------------------------------------------------- /bionic/protocol.py: -------------------------------------------------------------------------------- 1 | from . import protocols 2 | from .utils.misc import oneline 3 | 4 | # These are callable with or without arguments. See BaseProtocol.__call__ for 5 | # why we instantiate them here. 
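# For example (illustrative only, not part of the original module): because these
# instances are callable with or without arguments, an entity function is expected
# to work when decorated either as `@bn.protocol.picklable` or as
# `@bn.protocol.picklable()`; both forms should behave the same way.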
6 | picklable = protocols.PicklableProtocol() # noqa: F401 7 | dillable = protocols.DillableProtocol() # noqa: F401 8 | dask = protocols.DaskProtocol() # noqa: F401 9 | image = protocols.ImageProtocol() # noqa: F401 10 | numpy = protocols.NumPyProtocol() # noqa: F401 11 | yaml = protocols.YamlProtocol() # noqa: F401 12 | path = protocols.PathProtocol() # noqa: F401 13 | geodataframe = protocols.GeoPandasProtocol() # noqa: F401 14 | json = protocols.JsonProtocol() # noqa: F401 15 | 16 | 17 | def frame(func=None, file_format=None, check_dtypes=None): 18 | """ 19 | Decorator indicating that an entity will always have a pandas DataFrame 20 | type. 21 | 22 | The frame values will be serialized to either Parquet (default) or Feather. 23 | Parquet is more popular, but some types of data or frame structures are 24 | only supported by one format or the other. In particular, ordered 25 | categorical columns are supported by Feather and not Parquet. 26 | 27 | This decorator can be used with or without arguments: 28 | 29 | .. code-block:: python 30 | 31 | @frame 32 | def dataframe(...): 33 | ... 34 | 35 | @frame(file_format='feather') 36 | def dataframe(...): 37 | ... 38 | 39 | Parameters 40 | ---------- 41 | file_format: {'parquet', 'feather'} (default: 'parquet') 42 | Which file format to use when saving values to disk. 43 | check_dtypes: boolean (default: True) 44 | Check for column types not supported by the file format. This 45 | check is best-effort and not guaranteed to catch all problems. If 46 | an unsupported data type is found, an exception will be thrown at 47 | serialization time. 48 | """ 49 | 50 | # If the first argument is present, we were (hopefully) used as a decorator 51 | # without any other arguments. 52 | if func is not None: 53 | if file_format is not None or check_dtypes is not None: 54 | raise ValueError("frame can't be called with both a function and keywords") 55 | if not callable(func): 56 | raise ValueError( 57 | oneline( 58 | """ 59 | frame must be used either (a) directly as a decorator or 60 | (b) with keyword arguments; 61 | it can't take positional arguments. 62 | """ 63 | ) 64 | ) 65 | return protocols.ParquetDataFrameProtocol()(func) 66 | 67 | # Otherwise, we have arguments and should return a decorator. 68 | if file_format is None or file_format == "parquet": 69 | kwargs = {} 70 | if check_dtypes is not None: 71 | kwargs["check_dtypes"] = check_dtypes 72 | return protocols.ParquetDataFrameProtocol(**kwargs) 73 | elif file_format == "feather": 74 | return protocols.FeatherDataFrameProtocol() 75 | else: 76 | raise ValueError( 77 | oneline( 78 | f""" 79 | file_format must be one of {'parquet', 'feather'}; 80 | got {file_format!r}""" 81 | ) 82 | ) 83 | 84 | 85 | # These need to be called with arguments. 86 | enum = protocols.EnumProtocol # noqa: F401 87 | type = protocols.TypeProtocol # noqa: F401 88 | -------------------------------------------------------------------------------- /docs/api/protocols.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Protocols 3 | =========== 4 | 5 | Introduction 6 | ------------ 7 | 8 | Protocols are special cases of Bionic decorators; their effect is to specify 9 | the `Serialization Protocol <../concepts.rst#serialization-protocols>`_ for the 10 | entity being defined. For example: 11 | 12 | .. code-block:: python 13 | 14 | # This entity should only have values equal to "short" or "long". 
15 | @builder 16 | @bn.protocol.enum('short', 'long') 17 | def name_length(name): 18 | if len(name) < 10: 19 | return 'short' 20 | else: 21 | return 'long' 22 | 23 | # This entity's value will always be a ``pandas.DataFrame``. 24 | @builder 25 | @bn.protocol.frame 26 | def raw_df(): 27 | from sklearn import datasets 28 | dataset = datasets.load_breast_cancer() 29 | df = pd.DataFrame( 30 | data=dataset.data, 31 | ) 32 | df['target'] = dataset.target 33 | return df 34 | 35 | Protocols are used to tell Bionic how to serialize, deserialize, and validate 36 | entity values. In most cases, Bionic's default protocol can figure out an 37 | appropriate way to handle each value, so explicit protocol decorators are 38 | usually not required. However, they can be useful for data types that need 39 | special handling, or just to add clarity, safety, or documentation to a 40 | entity definition. 41 | 42 | Protocols can also be used when creating new entities with ``declare`` or 43 | ``assign``: 44 | 45 | .. code-block:: python 46 | 47 | builder.assign('name_length', 'short', bn.protocol.enum('short', 'long')) 48 | builder.declare('raw_df', bn.protocol.frame) 49 | 50 | Custom Protocols 51 | ---------------- 52 | 53 | If you need to control how an entity is serialized, you can write your own 54 | custom protocol. (However, since Bionic is still at an early stage, future 55 | API changes may break your implementation.) 56 | 57 | .. code-block:: python 58 | 59 | class MyProtocol(BaseProtocol): 60 | def get_fixed_file_extension(self): 61 | """ 62 | Returns a file extension identifying this protocol. This value will be appended 63 | to the name of any file written by the protocol, and may be used to determine 64 | whether a file can be read by the protocol. 65 | 66 | This string should be unique, not shared with any other protocol. By 67 | convention, it doesn't include an initial period, but may include periods in 68 | the middle. (For example, `"csv"`, and `"csv.zip"` would both be sensible 69 | file extensions.) 70 | """ 71 | raise NotImplementedError() 72 | 73 | def write(self, value, path): 74 | """Serializes the object ``value`` to the pathlib path ``path``.""" 75 | raise NotImplementedError() 76 | 77 | def read(self, path): 78 | """Deserializes an object from the pathlib path ``path``, and returns it.""" 79 | raise NotImplementedError() 80 | 81 | Built-In Protocol Decorators 82 | ---------------------------- 83 | 84 | .. autofunction:: bionic.protocol.dask 85 | .. autofunction:: bionic.protocol.dillable 86 | .. autofunction:: bionic.protocol.enum 87 | .. autofunction:: bionic.protocol.frame 88 | .. autofunction:: bionic.protocol.geodataframe 89 | .. autofunction:: bionic.protocol.image 90 | .. autofunction:: bionic.protocol.json 91 | .. autofunction:: bionic.protocol.numpy 92 | .. autofunction:: bionic.protocol.path 93 | .. autofunction:: bionic.protocol.picklable 94 | .. autofunction:: bionic.protocol.type 95 | .. autofunction:: bionic.protocol.yaml 96 | -------------------------------------------------------------------------------- /tests/test_flow/test_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import threading 3 | 4 | import pytest 5 | 6 | 7 | @pytest.mark.allows_parallel 8 | def test_logging_details(builder, log_checker, parallel_execution_enabled): 9 | """ 10 | Test the details of the log messages we emit. 
Since these messages are currently the 11 | best way to get visibility into what Bionic is doing, we have much more detailed 12 | tests than we'd normally want for logging. This means we'll have to tweak these 13 | tests as we update the format or implementation details of our logging. 14 | 15 | At some point we should introduce a separate system for user-facing 16 | progress reporting instead of using logs. 17 | """ 18 | 19 | builder.assign("x", 1) 20 | 21 | @builder 22 | def x_plus_one(x): 23 | return x + 1 24 | 25 | @builder 26 | def x_plus_two(x_plus_one): 27 | return x_plus_one + 1 28 | 29 | flow = builder.build() 30 | assert flow.get("x_plus_one") == 2 31 | log_checker.expect_all( 32 | "Accessed x(x=1) from definition", 33 | "Computing x_plus_one(x=1) ...", 34 | "Computed x_plus_one(x=1)", 35 | ) 36 | 37 | assert flow.get("x_plus_two") == 3 38 | 39 | if parallel_execution_enabled: 40 | # This is different from serial execution because we don't pass 41 | # in-memory cache to the subprocesses. The subprocess loads the 42 | # entities from disk cache instead. 43 | log_checker.expect_all( 44 | "Loaded x_plus_one(x=1) from disk cache", 45 | "Computing x_plus_two(x=1) ...", 46 | "Computed x_plus_two(x=1)", 47 | ) 48 | else: 49 | log_checker.expect_all( 50 | "Accessed x_plus_one(x=1) from in-memory cache", 51 | "Computing x_plus_two(x=1) ...", 52 | "Computed x_plus_two(x=1)", 53 | ) 54 | 55 | flow = builder.build() 56 | assert flow.get("x_plus_one") == 2 57 | # We don't access the definitions for simple lookup objects in 58 | # parallel execution unless we use the objects for computation. 59 | # Since we load x_plus_one from disk cache, we don't access the 60 | # definition for x. 61 | # To clarify: we do access it for looking at the cache, but it's 62 | # taken from the case key where it is loaded by default and is not 63 | # counted as definition access in the flow. 64 | log_checker.expect_all("Loaded x_plus_one(x=1) from disk cache") 65 | 66 | flow = builder.build() 67 | assert flow.get("x_plus_two") == 3 68 | log_checker.expect_all("Loaded x_plus_two(x=1) from disk cache") 69 | 70 | flow = flow.setting("x_plus_one", 3) 71 | assert flow.get("x_plus_two") == 4 72 | log_checker.expect_all( 73 | "Accessed x_plus_one(x_plus_one=3) from definition", 74 | "Computing x_plus_two(x_plus_one=3) ...", 75 | "Computed x_plus_two(x_plus_one=3)", 76 | ) 77 | 78 | 79 | class CannotPickleMe: 80 | def __init__(self): 81 | # Storing a lock makes it unpickleable 82 | self.lock = threading.Lock() 83 | 84 | def __str__(self): 85 | return "Cannot pickle me" 86 | 87 | 88 | def test_log_unpickleable_value(builder, log_checker): 89 | @builder 90 | def log_unpickleable_value(): 91 | # Test that we handle unpickleable value in `LogRecord.msg`. 92 | logging.info(CannotPickleMe()) 93 | # Test that we handle unpickleable value in `LogRecord.args`. 
94 | logging.info("Logging unpickleable class: %s", CannotPickleMe()) 95 | return 5 96 | 97 | assert builder.build().get("log_unpickleable_value") == 5 98 | 99 | log_checker.expect_all( 100 | "Computing log_unpickleable_value() ...", 101 | "Cannot pickle me", 102 | "Logging unpickleable class: Cannot pickle me", 103 | "Computed log_unpickleable_value()", 104 | ) 105 | -------------------------------------------------------------------------------- /docs/tutorials/hello_world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hello World" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": { 13 | "raw_mimetype": "text/restructuredtext" 14 | }, 15 | "source": [ 16 | "Let's start with a very simple example: a Bionic flow that generates the text\n", 17 | "\"Hello world!\"\n", 18 | "\n", 19 | "*(The code for this example is available in the Bionic repo at\n", 20 | "example/hello_world.py.)*\n", 21 | "\n", 22 | ".. literalinclude:: ../../example/hello_world.py\n", 23 | " :language: python\n", 24 | " :linenos:\n", 25 | "\n", 26 | "We can run this code (assuming we've checked out the ``bionic`` repo) like\n", 27 | "this:\n", 28 | "\n", 29 | ".. code-block:: bash\n", 30 | "\n", 31 | " > cd bionic\n", 32 | " > python example/hello_world.py\n", 33 | " Hello world!\n", 34 | "\n", 35 | "We can also import it in an interpreter or notebook:" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Configure the PYTHONPATH for this notebook.\n", 45 | "import _tutorial_setup\n", 46 | "\n", 47 | "from example.hello_world import flow\n", 48 | "flow.get('message')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Although our `flow` object is immutable, we can easily make a new copy with a different value of `subject`:" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "new_flow = flow.setting('subject', 'universe')\n", 65 | "new_flow.get('message')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "We can also try changing the `message` directly:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "flow.setting('message', 'Goodbye world!').get('message')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "As a convenience, `setting` and `get` can be called by an alternative syntax which makes it easier for your notebook or interpreter to autocomplete entity names:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "flow.setting.subject('universe').get.message()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Finally, we can visualize our flow as a directed acyclic graph:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "flow.render_dag()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "This flow doesn't do much, but it illustrates how flows can be constructed, used, and 
modified.\n", 121 | "The next tutorial will demonstrate a more practical example." 122 | ] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.7.3" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /tests/test_flow/test_outputs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pandas as pd 4 | import pandas.testing as pdt 5 | 6 | from ..helpers import RoundingProtocol 7 | 8 | import bionic as bn 9 | from bionic.exception import EntityValueError, UndefinedEntityError 10 | 11 | 12 | @pytest.fixture(scope="function") 13 | def preset_builder(builder): 14 | builder.assign("x", 2) 15 | builder.assign("y", 3) 16 | 17 | return builder 18 | 19 | 20 | def test_output(preset_builder): 21 | builder = preset_builder 22 | 23 | @builder 24 | @bn.output("g") 25 | def f(x, y): 26 | return x + y 27 | 28 | flow = builder.build() 29 | 30 | assert flow.get("g") == 5 31 | 32 | with pytest.raises(UndefinedEntityError): 33 | flow.get("f") 34 | 35 | 36 | def test_outputs(builder): 37 | builder.assign("numerator", 14) 38 | builder.assign("denominator", 3) 39 | 40 | @builder 41 | @bn.outputs("quotient", "remainder") 42 | def divide(numerator, denominator): 43 | quotient = numerator // denominator 44 | remainder = numerator % denominator 45 | return quotient, remainder 46 | 47 | flow = builder.build() 48 | 49 | assert flow.get("quotient") == 4 50 | assert flow.get("remainder") == 2 51 | 52 | with pytest.raises(UndefinedEntityError): 53 | flow.get("divide") 54 | 55 | 56 | def test_outputs_custom_protocols_first(builder): 57 | builder.assign("location", (37.7, -122.4)) 58 | 59 | @builder 60 | @bn.outputs("lat", "lon") 61 | def latlon(location): 62 | return location 63 | 64 | @builder 65 | @RoundingProtocol() 66 | @bn.outputs("rounded_lat", "rounded_lon") 67 | def rounded_latlon(lat, lon): 68 | return lat, lon 69 | 70 | @builder 71 | @bn.outputs("other_rounded_lat", "other_rounded_lon") 72 | @RoundingProtocol() 73 | def other_rounded_latlon(lat, lon): 74 | return lat, lon 75 | 76 | flow = builder.build() 77 | 78 | assert flow.get("lat") == 37.7 79 | assert flow.get("lon") == -122.4 80 | 81 | assert flow.get("rounded_lat") == 38 82 | assert flow.get("rounded_lon") == -122 83 | 84 | assert flow.get("other_rounded_lat") == 38 85 | assert flow.get("other_rounded_lon") == -122 86 | 87 | 88 | # I'm not sure if there's an easy way to test that we're using the correct 89 | # default protocol for each type, but at least we can check that nothing 90 | # breaks. 
91 | def test_outputs_default_protocols(builder): 92 | expected_df = pd.DataFrame(columns=["x", "y"], data=[[1, 2], [3, 4]]) 93 | 94 | @builder 95 | @bn.outputs("size", "df") 96 | def f(): 97 | df = expected_df.copy() 98 | return len(df), df 99 | 100 | flow = builder.build() 101 | 102 | assert flow.get("size") == 2 103 | pdt.assert_frame_equal(flow.get("df"), expected_df) 104 | 105 | 106 | def test_singleton_outputs(builder): 107 | @builder 108 | @bn.outputs("a") 109 | def one_output(): 110 | return (2,) 111 | 112 | assert builder.build().get("a") == 2 113 | 114 | 115 | def test_wrong_number_of_outputs(builder): 116 | @builder 117 | @bn.outputs("a", "b") 118 | def three_outputs(): 119 | return (1, 2, 3) 120 | 121 | flow = builder.build() 122 | with pytest.raises(EntityValueError): 123 | flow.get("a") 124 | 125 | 126 | def test_non_sequence_outputs(builder): 127 | @builder 128 | @bn.outputs("a", "b") 129 | def non_sequence_output(): 130 | return 1 131 | 132 | flow = builder.build() 133 | with pytest.raises(EntityValueError): 134 | flow.get("a") 135 | 136 | 137 | def test_non_sequence_outputs_message(builder): 138 | @builder 139 | @bn.outputs("a") 140 | def non_sequence_output(): 141 | return 7 142 | 143 | flow = builder.build() 144 | with pytest.raises(EntityValueError) as e: 145 | flow.get("a") 146 | assert "did you mean to use @output instead of @outputs?" in e.value 147 | -------------------------------------------------------------------------------- /example/ml_workflow_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | A CLI for running an extended version of the ML example in `ml_workflow`. 3 | 4 | Fits and evaluates a model on a scikit-learn dataset. 5 | """ 6 | 7 | import time 8 | 9 | import pandas as pd 10 | from sklearn import datasets, metrics 11 | 12 | import bionic as bn 13 | from .ml_workflow import flow as base_ml_flow 14 | 15 | # Add an AUC score summary to our flow. 16 | builder = base_ml_flow.to_builder() 17 | 18 | 19 | @builder 20 | def auc_score(test_frame, prediction_frame): 21 | """ 22 | The Area Under the (Receiver Operating Characteristic) Curve. 23 | """ 24 | return metrics.roc_auc_score( 25 | test_frame["target"], 26 | prediction_frame["proba"], 27 | ) 28 | 29 | 30 | @builder 31 | @bn.gather(over="hyperparams_dict", also="auc_score", into="gathered_frame") 32 | @bn.outputs("best_hyperparams_dict", "best_auc_score") 33 | @bn.docs( 34 | """The hyperparameter settings with the highest AUC score.""", 35 | """The best (highest) AUC score, compared over all hyperparameter settings.""", 36 | ) 37 | def best_settings(gathered_frame): 38 | best_row = gathered_frame.sort_values("auc_score", ascending=False).iloc[0] 39 | return best_row[["hyperparams_dict", "auc_score"]] 40 | 41 | 42 | flow = builder.build() 43 | 44 | # Compute and print the model performance. 
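# Example invocations (illustrative sketch; the flags are the ones defined by the
# argparse parser below, the bucket name is a placeholder, and the script should be
# run with `-m` because of the relative import of `ml_workflow`):
#
#     python -m example.ml_workflow_cli
#     python -m example.ml_workflow_cli -C 0.1,1,10 --parallel
#     python -m example.ml_workflow_cli --bucket my-gcs-bucket --big-dataset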
45 | if __name__ == "__main__": 46 | import argparse 47 | 48 | parser = argparse.ArgumentParser(description="Runs a simple ML workflow example") 49 | parser.add_argument("--bucket", "-b", help="Name of GCS bucket to cache in") 50 | parser.add_argument( 51 | "--quiet", "-q", help="Don't enable INFO-level logging", action="store_true" 52 | ) 53 | parser.add_argument( 54 | "--parallel", "-p", help="Run flow in parallel", action="store_true" 55 | ) 56 | parser.add_argument( 57 | "-C", 58 | help="Value or values (comma-separated) for " 59 | "the inverse regularization parameter 'C'", 60 | ) 61 | parser.add_argument( 62 | "--big-dataset", "-B", help="Use bigger covertype dataset", action="store_true" 63 | ) 64 | parser.add_argument( 65 | "--render-dag", 66 | "-D", 67 | help="Render DAG visualization to file instead of running", 68 | ) 69 | 70 | args = parser.parse_args() 71 | if not args.quiet: 72 | bn.utils.misc.init_basic_logging() 73 | if args.bucket is not None: 74 | flow = flow.setting( 75 | "core__persistent_cache__gcs__bucket_name", args.bucket 76 | ).setting("core__persistent_cache__gcs__enabled", True) 77 | if args.C is not None: 78 | c_values_str = args.C 79 | c_values = [ 80 | float(c_value_str.strip()) for c_value_str in c_values_str.split(",") 81 | ] 82 | flow = flow.setting("hyperparams_dict", values=[{"C": c} for c in c_values]) 83 | if args.big_dataset: 84 | builder = flow.to_builder() 85 | 86 | @builder 87 | @bn.version("covtype dataset") 88 | def raw_frame(): 89 | dataset = datasets.fetch_covtype() 90 | feature_names = [f"feature_{ix}" for ix in range(dataset.data.shape[1])] 91 | df = pd.DataFrame(data=dataset.data, columns=feature_names) 92 | # This is a multiclass dataset, but we want to treat it as a binary one. 93 | # We'll just try to detect class 2, since that one is the most common. 94 | df["target"] = dataset.target == 2 95 | return df 96 | 97 | flow = builder.build() 98 | if args.parallel: 99 | flow = flow.setting("core__parallel_execution__enabled", True) 100 | 101 | if args.render_dag: 102 | flow.render_dag().save(args.render_dag) 103 | exit() 104 | 105 | start = time.time() 106 | all_hpds = flow.get("hyperparams_dict", collection=list) 107 | best_hpd = flow.get("best_hyperparams_dict") 108 | best_auc_score = flow.get("best_auc_score") # noqa: F811 109 | end = time.time() 110 | 111 | print(f"Number of hyperparameter configurations tested: {len(all_hpds)}") 112 | print(f"Best hyperparameter configuration: {best_hpd!r}") 113 | print(f"Best AUC: {best_auc_score!r}") 114 | print(f"Total time taken: {end - start}") 115 | -------------------------------------------------------------------------------- /bionic/deps/optdep.py: -------------------------------------------------------------------------------- 1 | import re 2 | import importlib 3 | 4 | from .extras import extras_require as package_desc_lists_by_extra 5 | from ..utils.misc import oneline 6 | 7 | 8 | ILLEGAL_NAME_CHAR = re.compile("[^a-zA-Z0-9\\-._\\[\\]]") 9 | 10 | 11 | def first_token_from_package_desc(desc): 12 | first_mismatch = ILLEGAL_NAME_CHAR.search(desc) 13 | if first_mismatch is None: 14 | return desc 15 | 16 | if desc[first_mismatch.start()] not in " <>!=": 17 | raise AssertionError( 18 | oneline( 19 | f""" 20 | Package descriptor {desc!r} contained 21 | unexpected character {desc[first_mismatch.start()]!r}""" 22 | ) 23 | ) 24 | 25 | return desc[: first_mismatch.start()] 26 | 27 | 28 | # For packages that we don't import by the exact package name, these are 29 | # aliases we use. 
30 | alias_lists_by_package = { 31 | "google-cloud-logging": ["google.cloud.logging"], 32 | "google-auth": ["google.auth"], 33 | "Pillow": ["PIL.Image"], 34 | "dask[dataframe]": ["dask.dataframe"], 35 | "google-api-python-client": ["googleapiclient.discovery"], 36 | } 37 | 38 | # Now we contruct a new data structure to allow us to give helpful error 39 | # messages when the user tries to import a package that's not available. 40 | extras_by_importable_name = {} 41 | for extra, package_descs in package_desc_lists_by_extra.items(): 42 | for package_desc in package_descs: 43 | package = first_token_from_package_desc(package_desc) 44 | 45 | # Associate this package with the extra it belongs to -- as long as 46 | # we haven't seen this package before. (Because we're iterating over 47 | # an OrderedDict, this will end up associating each package with the 48 | # first extra that requires it, which should also be the most specific 49 | # extra, and therefore the most helpful one to mention in an error 50 | # message.) 51 | if package not in extras_by_importable_name: 52 | extras_by_importable_name[package] = extra 53 | 54 | if package in alias_lists_by_package: 55 | for importable_name in alias_lists_by_package[package]: 56 | assert importable_name not in extras_by_importable_name 57 | extras_by_importable_name[importable_name] = extra 58 | 59 | # This is a fake entry for testing, since it's annoying to mock this. 60 | TEST_EXTRA_NAME = "_FAKE_TEST_EXTRA_" 61 | TEST_PACKAGE_NAME = "_FAKE_TEST_PACKAGE_" 62 | extras_by_importable_name[TEST_PACKAGE_NAME] = TEST_EXTRA_NAME 63 | 64 | 65 | # This is based on a similar function in Pandas: 66 | # https://github.com/pandas-dev/pandas/blob/8ea102acdb45bb70cb30ea77108a50054c28c24d/pandas/compat/_optional.py 67 | def import_optional_dependency(name, purpose=None, raise_on_missing=True): 68 | """ 69 | Attempts to import a Python module that may or may not be available. If 70 | it's not available, this function throws an ImportError explaining what the 71 | user needs to install. (Unless ``raise_on_missing`` is set to False, in 72 | which case it returns None.) 73 | """ 74 | 75 | if name not in extras_by_importable_name: 76 | raise AssertionError( 77 | oneline( 78 | f""" 79 | Attempted to import {name!r}, 80 | which is not registered as a dependency""" 81 | ) 82 | ) 83 | 84 | # TODO Once we have specific version requirements for our optional 85 | # packages, we should check that the version is correct. 86 | 87 | try: 88 | return importlib.import_module(name) 89 | except ImportError: 90 | if raise_on_missing: 91 | extra_name = extras_by_importable_name[name] 92 | 93 | if purpose is None: 94 | description = "required" 95 | else: 96 | description = "required for " + purpose 97 | 98 | raise ImportError( 99 | oneline( 100 | f""" 101 | Unable to import package {name!r}, which is {description}; 102 | you can use ``pip install 'bionic[{extra_name}]'`` 103 | to resolve this""" 104 | ) 105 | ) 106 | 107 | else: 108 | return None 109 | -------------------------------------------------------------------------------- /bionic/utils/keyed_priority_stack.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides an implementation of an updatable priority queue with specific ordering rules. 3 | 4 | The other implementations I found (including `heapq` and `queue.PriorityQueue`) all have 5 | one or more problems: 6 | 7 | 1. 
They're based on min-heaps, so they return the lowest value first; this is 8 | counterintuitive when dealing with priorities. 9 | 2. Built-in tiebreaking is not provided. 10 | 3. Updating priorities is impossible, or needs to be implemented separately. 11 | 12 | The implementation here is based on the `heapq` implementation of a binary heap, but 13 | adds reversed ordering (highest priority first), LIFO tiebreaking, and keyed lookup. 14 | """ 15 | 16 | import heapq 17 | from functools import total_ordering 18 | 19 | 20 | class KeyedPriorityStack: 21 | """ 22 | An updatable priority queue where ties are broken in last-in-first-out (LIFO) order. 23 | 24 | This data structure has a stack-like interface, supporting `push` and `pop`, but 25 | each element on the stack also has an associated key and priority. By default, 26 | `pop` removes returns the element with the *highest* priority (breaking ties in 27 | LIFO order), but it also accepts an optional key argument that specifies a specific 28 | element to be popped. This can be used to easily update an element's priority. 29 | """ 30 | 31 | def __init__(self): 32 | self._heap = [] 33 | self._next_seq_id = 0 34 | self._n_unremoved_entries = 0 35 | self._unremoved_entries_by_key = {} 36 | 37 | def push(self, key, value, priority): 38 | """ 39 | Adds a value to the stack with associated key and priority. 40 | """ 41 | 42 | if key is None: 43 | raise KeyError("Attempted to add None as key to priority stack") 44 | if key in self._unremoved_entries_by_key: 45 | raise ValueError( 46 | f"Attempted to add duplicate key to priority stack: {key!r}" 47 | ) 48 | seq_id = self._next_seq_id 49 | self._next_seq_id += 1 50 | entry = PriorityEntry(priority, seq_id, key, value) 51 | self._unremoved_entries_by_key[key] = entry 52 | heapq.heappush(self._heap, entry) 53 | self._n_unremoved_entries += 1 54 | 55 | def pop(self, key=None): 56 | """ 57 | Removes a value from the stack and returns it. 58 | 59 | If no key is provided, removes and returns the highest-priority element (or 60 | the last-added such element, if there is a tie). 61 | 62 | If a key is provided, removes and returns the element with that key. 63 | """ 64 | 65 | if key is not None: 66 | if key not in self._unremoved_entries_by_key: 67 | raise KeyError(f"Key not found in priority stack: {key!r}") 68 | entry = self._unremoved_entries_by_key.pop(key) 69 | entry.is_removed = True 70 | self._n_unremoved_entries -= 1 71 | return entry.value 72 | 73 | else: 74 | while True: 75 | if self._n_unremoved_entries == 0: 76 | raise IndexError("Attempted to get item from empty priority stack") 77 | entry = heapq.heappop(self._heap) 78 | if entry.is_removed: 79 | continue 80 | self._n_unremoved_entries -= 1 81 | del self._unremoved_entries_by_key[entry.key] 82 | return entry.value 83 | 84 | def __len__(self): 85 | """ 86 | Returns the number of elements on the stack. 
87 | """ 88 | 89 | return self._n_unremoved_entries 90 | 91 | 92 | @total_ordering 93 | class PriorityEntry: 94 | def __init__(self, priority, seq_id, key, value): 95 | self.priority = priority 96 | self.seq_id = seq_id 97 | self.key = key 98 | self.value = value 99 | self.is_removed = False 100 | 101 | def __lt__(self, other): 102 | assert isinstance(other, PriorityEntry) 103 | 104 | return (self.priority, self.seq_id) > (other.priority, other.seq_id) 105 | 106 | def __eq__(self, other): 107 | if not isinstance(other, PriorityEntry): 108 | return False 109 | return (self.priority, self.seq_id) == (other.priority, other.seq_id) 110 | -------------------------------------------------------------------------------- /docs/get-started.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Get Started 3 | =========== 4 | 5 | Installation 6 | ------------ 7 | 8 | Bionic can be installed using ``pip``: 9 | 10 | .. code-block:: bash 11 | 12 | pip install 'bionic[standard]' 13 | 14 | The ``bionic[standard]`` package includes the core framework as well as the 15 | most commonly-used dependencies. There are several other subpackages offering 16 | different dependencies, documented :ref:`below`. 17 | 18 | You will probably also want to install `Graphviz `_, 19 | which Bionic uses to generate visualizations of its workflow graph. 20 | Unfortunately Graphviz is not written in Python and can't be installed by 21 | ``pip``. On Mac OS X, you can use `Homebrew `_ to install 22 | it: 23 | 24 | .. code-block:: bash 25 | 26 | brew install graphviz 27 | 28 | If you want your data to be automatically `cached to Google Cloud Storage`_, 29 | you'll also need to have the `Google Cloud SDK`_ installed, have access to a 30 | GCS bucket, and install the ``bionic[gcp]`` subpackage. 31 | 32 | .. _cached to Google Cloud Storage: concepts.rst#caching-in-google-cloud-storage 33 | .. _Google Cloud SDK : https://cloud.google.com/sdk/ 34 | 35 | Finally, installing `LibYAML `_ will improve 36 | performance for some workloads. LibYAML is also available via Homebrew: 37 | 38 | .. code-block:: bash 39 | 40 | brew install libyaml 41 | 42 | Bionic supports Python 3.7 and above. 43 | 44 | .. _extra-packages: 45 | 46 | Extra Packages 47 | .............. 48 | 49 | The default ``bionic`` PyPI package installs only the minimal dependencies for 50 | building and running flows. However, many other dependency configurations are 51 | available. Most users will want the ``bionic[standard]`` package, which 52 | supports common integrations like `Matplotlib `_, 53 | as well as `graph visualization`_. 54 | 55 | .. 
_graph visualization: concepts.rst#visualizing-flows 56 | 57 | The full set of subpackages is as follows: 58 | 59 | ========== ==================================== ================================ 60 | Subpackage Installation Command Enables 61 | ========== ==================================== ================================ 62 | dask ``pip install 'bionic[dask]'`` the ``@dask`` decorator 63 | ---------- ------------------------------------ -------------------------------- 64 | dev ``pip install 'bionic[dev]'`` every feature; testing; building 65 | documentation 66 | ---------- ------------------------------------ -------------------------------- 67 | dill ``pip install 'bionic[dill]'`` the ``@dillable`` decorator 68 | ---------- ------------------------------------ -------------------------------- 69 | examples ``pip install 'bionic[examples]'`` the tutorial example code 70 | ---------- ------------------------------------ -------------------------------- 71 | full ``pip install 'bionic[full]'`` every non-development feature 72 | ---------- ------------------------------------ -------------------------------- 73 | gcp ``pip install 'bionic[gcp]'`` caching to GCS 74 | ---------- ------------------------------------ -------------------------------- 75 | geopandas ``pip install 'bionic[geopandas]'`` the ``@geodataframe`` decorator 76 | ---------- ------------------------------------ -------------------------------- 77 | image ``pip install 'bionic[image]'`` automatic de/serialization of 78 | ``PIL.Image`` objects 79 | ---------- ------------------------------------ -------------------------------- 80 | matplotlib ``pip install 'bionic[matplotlib]'`` the ``@pyplot`` decorator 81 | ---------- ------------------------------------ -------------------------------- 82 | parallel ``pip install 'bionic[parallel]'`` parallel execution 83 | ---------- ------------------------------------ -------------------------------- 84 | standard ``pip install 'bionic[standard]'`` graph visualization; ``Image`` 85 | handling; ``@pyplot`` 86 | ---------- ------------------------------------ -------------------------------- 87 | viz ``pip install 'bionic[viz]'`` graph visualization 88 | ========== ==================================== ================================ 89 | 90 | Tutorials 91 | --------- 92 | 93 | These two worked examples illustrate the basic mechanics of Bionic. 94 | 95 | .. toctree:: 96 | :maxdepth: 1 97 | 98 | tutorials/hello_world.ipynb 99 | tutorials/ml_workflow.ipynb 100 | -------------------------------------------------------------------------------- /bionic/aip/docker_image_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Builds a docker image for Google AI Platform execution using the current Python 3 | environment. 
4 | """ 5 | import pathlib 6 | import subprocess 7 | from concurrent.futures import Future 8 | from concurrent.futures.thread import ThreadPoolExecutor 9 | from textwrap import dedent 10 | 11 | from bionic.deps.optdep import import_optional_dependency 12 | 13 | import hashlib 14 | import sys 15 | import tempfile 16 | import re 17 | import logging 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | _cached_docker_module = None 23 | _cached_docker_client = None 24 | 25 | 26 | def get_docker_module(): 27 | global _cached_docker_module 28 | 29 | if _cached_docker_module is None: 30 | _cached_docker_module = import_optional_dependency( 31 | "docker", purpose="Build Docker images" 32 | ) 33 | 34 | return _cached_docker_module 35 | 36 | 37 | def get_docker_client(): 38 | global _cached_docker_client 39 | 40 | if _cached_docker_client is None: 41 | docker = get_docker_module() 42 | logger.info("Initializing Docker client ...") 43 | _cached_docker_client = docker.from_env() 44 | 45 | return _cached_docker_client 46 | 47 | 48 | def fix_pip_requirements(pip_requirements: str) -> str: 49 | # Pip freeze may contain entries with editable installs pointing to remote 50 | # git repositories. This can happen when doing Bionic development. Docker 51 | # service is not able to access repositories using the git+git protocol. 52 | # Hence, any entries containing git+git is converted to use git+https. 53 | # 54 | # Example entry: 55 | # -e git+git@github.com:square/bionic.git@f13f5405e928d92b553d2cbee41084eecccf7de3#egg=bionic 56 | # 57 | # Converted entry: 58 | # -e git+https://github.com/square/bionic.git@f13f5405e928d92b553d2cbee41084eecccf7de3#egg=bionic 59 | 60 | def fix_line(line: str) -> str: 61 | if line.startswith("-e git+git"): 62 | return re.sub(r"-e ([^@]*)@", "-e git+https://", line.replace(":", "/")) 63 | else: 64 | return line 65 | 66 | return "\n".join([fix_line(line) for line in pip_requirements.split("\n")]) 67 | 68 | 69 | def get_pip_freeze() -> str: 70 | return subprocess.run( 71 | ["pip", "freeze"], capture_output=True, check=True, encoding="utf-8" 72 | ).stdout 73 | 74 | 75 | def get_pip_requirements() -> str: 76 | return fix_pip_requirements(get_pip_freeze()) 77 | 78 | 79 | def get_image_uri(project_name: str, pip_requirements: str) -> str: 80 | m = hashlib.sha256() 81 | m.update(pip_requirements.encode("utf-8")) 82 | m.update(str(sys.version_info).encode("utf-8")) 83 | 84 | image_tag = f"bionic_{m.hexdigest()}" 85 | 86 | return f"gcr.io/{project_name}/bionic:{image_tag}" 87 | 88 | 89 | def build_image( 90 | docker_client, 91 | pip_requirements: str, 92 | image_uri: str, 93 | ): 94 | with tempfile.TemporaryDirectory() as tmp_dir: 95 | tmp_path = pathlib.Path(tmp_dir) 96 | 97 | (tmp_path / "requirements.txt").write_text(pip_requirements) 98 | 99 | container_image = f"python:{sys.version_info[0]}.{sys.version_info[1]}" 100 | 101 | (tmp_path / "Dockerfile").write_text( 102 | dedent( 103 | f""" 104 | FROM {container_image} 105 | COPY requirements.txt requirements.txt 106 | RUN pip install -r requirements.txt 107 | """ 108 | ) 109 | ) 110 | 111 | logger.info(f"Building {image_uri} using {container_image}") 112 | image, _ = docker_client.images.build(path=tmp_dir, tag=f"{image_uri}") 113 | 114 | logger.info(f"Pushing {image_uri}") 115 | for line in docker_client.images.push(f"{image_uri}", stream=True, decode=True): 116 | logger.debug(line) 117 | 118 | logger.info(f"Uploaded {image_uri}") 119 | 120 | 121 | def build_image_if_missing(project_name: str) -> str: 122 | pip_requirements = 
get_pip_requirements() 123 | image_uri = get_image_uri(project_name, pip_requirements) 124 | 125 | docker = get_docker_module() 126 | docker_client = get_docker_client() 127 | 128 | try: 129 | docker_client.images.get_registry_data(image_uri) 130 | logger.info(f"{image_uri} already exists") 131 | except docker.errors.NotFound: 132 | build_image( 133 | docker_client=docker_client, 134 | pip_requirements=pip_requirements, 135 | image_uri=image_uri, 136 | ) 137 | 138 | return image_uri 139 | 140 | 141 | def build_image_if_missing_async(project_name: str) -> Future: 142 | return ThreadPoolExecutor(max_workers=1).submit( 143 | build_image_if_missing, project_name 144 | ) 145 | -------------------------------------------------------------------------------- /tests/test_flow/test_copy.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import dask.dataframe as dd 5 | import pytest 6 | 7 | import bionic as bn 8 | from ..helpers import df_from_csv_str, equal_frame_and_index_content 9 | 10 | 11 | @pytest.fixture 12 | def preset_builder(builder): 13 | builder.assign("x", 2) 14 | builder.assign("y", 3) 15 | 16 | @builder 17 | def f(x, y): 18 | return x + y 19 | 20 | return builder 21 | 22 | 23 | @pytest.fixture 24 | def flow(preset_builder): 25 | return preset_builder.build() 26 | 27 | 28 | @pytest.fixture 29 | def expected_dask_df(): 30 | df_value = df_from_csv_str( 31 | """ 32 | color,number 33 | red,1 34 | blue,2 35 | green,3 36 | """ 37 | ) 38 | return dd.from_pandas(df_value, npartitions=1) 39 | 40 | 41 | @pytest.fixture 42 | def dask_flow(builder, expected_dask_df): 43 | @builder 44 | @bn.protocol.dask 45 | def dask_df(): 46 | return expected_dask_df 47 | 48 | return builder.build() 49 | 50 | 51 | @pytest.fixture 52 | def override_gcs_for_copy_if_fake_gcp(use_fake_gcp, gcs_fs, monkeypatch): 53 | """ 54 | A flow has an instance of GCS filesystem if GCS caching is enabled. But we 55 | still need to support the case where the user wants to upload the results to 56 | GCS even though GCS caching is disabled for the flow. Hence, the 57 | upload_to_gcs method does not use the flow's GCS filesystem in case GCS 58 | caching is disabled. If we have a fake GCS filesystem, we have to patch it 59 | manually. 
60 | """ 61 | 62 | if use_fake_gcp: 63 | monkeypatch.setattr("bionic.gcs.get_gcs_fs_without_warnings", lambda: gcs_fs) 64 | 65 | 66 | def test_copy_file_to_existing_local_dir(flow, tmp_path): 67 | dir_path = tmp_path / "output" 68 | dir_path.mkdir() 69 | flow.get("f", mode="FileCopier").copy(destination=dir_path) 70 | 71 | expected_file_path = dir_path / "f.json" 72 | assert json.loads(expected_file_path.read_bytes()) == 5 73 | 74 | 75 | def test_copy_file_to_local_file(flow, tmp_path): 76 | file_path = tmp_path / "data.json" 77 | flow.get("f", mode="FileCopier").copy(destination=file_path) 78 | 79 | assert json.loads(file_path.read_bytes()) == 5 80 | 81 | 82 | def test_copy_file_to_local_file_using_str(flow, tmp_path): 83 | file_path = tmp_path / "data.json" 84 | file_path_str = str(file_path) 85 | flow.get("f", mode="FileCopier").copy(destination=file_path_str) 86 | assert json.loads(file_path.read_bytes()) == 5 87 | 88 | 89 | @pytest.mark.needs_gcs 90 | def test_copy_file_to_gcs_dir( 91 | flow, tmp_path, tmp_gcs_url_prefix, override_gcs_for_copy_if_fake_gcp, gcs_fs 92 | ): 93 | flow.get("f", mode="FileCopier").copy(destination=tmp_gcs_url_prefix) 94 | cloud_url = tmp_gcs_url_prefix + "f.json" 95 | local_path = tmp_path / "f.json" 96 | gcs_fs.get_file(cloud_url, local_path) 97 | assert json.loads(local_path.read_bytes()) == 5 98 | 99 | 100 | @pytest.mark.needs_gcs 101 | def test_copy_file_to_gcs_file( 102 | flow, tmp_path, tmp_gcs_url_prefix, override_gcs_for_copy_if_fake_gcp, gcs_fs 103 | ): 104 | cloud_url = tmp_gcs_url_prefix + "f.json" 105 | flow.get("f", mode="FileCopier").copy(destination=cloud_url) 106 | local_path = tmp_path / "f.json" 107 | gcs_fs.get_file(cloud_url, local_path) 108 | assert json.loads(local_path.read_bytes()) == 5 109 | 110 | 111 | def test_copy_dask_to_dir(tmp_path, expected_dask_df, dask_flow): 112 | destination = tmp_path / "output" 113 | destination.mkdir() 114 | expected_dir_path = destination / "dask_df.pq.dask" 115 | 116 | dask_flow.get("dask_df", mode="FileCopier").copy(destination=destination) 117 | 118 | actual = dd.read_parquet(expected_dir_path) 119 | assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute()) 120 | 121 | 122 | @pytest.mark.needs_gcs 123 | def test_copy_dask_to_gcs_dir( 124 | tmp_path, 125 | tmp_gcs_url_prefix, 126 | expected_dask_df, 127 | dask_flow, 128 | override_gcs_for_copy_if_fake_gcp, 129 | gcs_fs, 130 | ): 131 | cloud_url = tmp_gcs_url_prefix + "output" 132 | local_path = tmp_path / "output" 133 | 134 | dask_flow.get("dask_df", mode="FileCopier").copy(destination=cloud_url) 135 | 136 | gcs_fs.get(cloud_url, str(local_path), recursive=True) 137 | actual = dd.read_parquet(local_path) 138 | assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute()) 139 | 140 | 141 | def test_get_multi_value_entity(builder): 142 | my_set = {"oscar", "the", "grouch"} 143 | builder.assign("val", values=my_set) 144 | 145 | @builder 146 | def multi_entity(val): 147 | return val 148 | 149 | flow = builder.build() 150 | results = flow.get("multi_entity", collection=set, mode=Path) 151 | results = {json.loads(res.read_bytes()) for res in results} 152 | 153 | assert results == my_set 154 | -------------------------------------------------------------------------------- /tests/test_flow/test_dagviz.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for dagviz and FlowImage class. 
3 | """ 4 | 5 | import pytest 6 | from xml.etree import ElementTree as ET 7 | from PIL import Image 8 | 9 | import bionic as bn 10 | from bionic import dagviz 11 | 12 | 13 | @pytest.fixture 14 | def flow(builder): 15 | builder.assign("first_name", values=["Alice", "Bob"]) 16 | builder.assign("last_name", "Smith") 17 | 18 | @builder 19 | @bn.outputs("full_name", "initials") 20 | @bn.docs( 21 | """The full name.""", 22 | """Just the initials.""", 23 | ) 24 | def _(first_name, last_name): 25 | return f"{first_name} {last_name}", f"{first_name[0]}{last_name[0]}" 26 | 27 | @builder 28 | @bn.gather(over="full_name") 29 | @bn.returns("all_names,") 30 | def _(gather_df): 31 | """Comma-separated list of names.""" 32 | return ", ".join(gather_df["full_name"]) 33 | 34 | return builder.build() 35 | 36 | 37 | @pytest.fixture 38 | def flow_image(flow): 39 | return flow.render_dag() 40 | 41 | 42 | @pytest.fixture 43 | def flow_graph(flow): 44 | return flow._deriver.export_dag() 45 | 46 | 47 | @pytest.fixture 48 | def flow_dot(flow_graph): 49 | return dagviz.dot_from_graph(flow_graph) 50 | 51 | 52 | def nodes_by_name_from_dot(dot): 53 | return { 54 | node.get_name(): node 55 | for subgraph in dot.get_subgraphs() 56 | for node in subgraph.get_nodes() 57 | } 58 | 59 | 60 | def test_dag_size(flow_graph): 61 | assert len(flow_graph.nodes) == 11 62 | 63 | 64 | def test_dot_names_and_colors(flow_dot): 65 | nodes = nodes_by_name_from_dot(flow_dot) 66 | same_color_name_groups = [ 67 | # We've wrapped all our names in quotes to work around pydot. However, they're 68 | # not visible in the visualization. 69 | ['"first_name[0]"', '"first_name[1]"'], 70 | ['"last_name"'], 71 | [ 72 | '"[0]"', 73 | '"[1]"', 74 | '"full_name[0]"', 75 | '"full_name[1]"', 76 | '"initials[0]"', 77 | '"initials[1]"', 78 | ], 79 | ['""', '"all_names"'], 80 | ] 81 | 82 | all_names = [name for name_group in same_color_name_groups for name in name_group] 83 | assert set(nodes.keys()) == set(all_names) 84 | 85 | all_group_colors = set() 86 | for name_group in same_color_name_groups: 87 | group_colors = set(nodes[name].get_fillcolor() for name in name_group) 88 | assert len(group_colors) == 1 89 | (group_color,) = group_colors 90 | assert group_color not in all_group_colors 91 | all_group_colors.add(group_color) 92 | 93 | 94 | def test_dot_tooltips(flow_dot): 95 | nodes = nodes_by_name_from_dot(flow_dot) 96 | assert nodes['"last_name"'].get_tooltip() == "Persisted: True" 97 | assert ( 98 | nodes['"all_names"'].get_tooltip() 99 | == "Comma-separated list of names.\n\nPersisted: True" 100 | ) 101 | assert ( 102 | nodes['"initials[0]"'].get_tooltip() == "Just the initials.\n\nPersisted: True" 103 | ) 104 | assert ( 105 | nodes['"initials[1]"'].get_tooltip() == "Just the initials.\n\nPersisted: True" 106 | ) 107 | assert ( 108 | nodes['"[0]"'].get_tooltip() 109 | == "(Intermediate value) A Python tuple with 2 values.\n\nPersisted: False" 110 | ) 111 | 112 | 113 | def test_save_flowimage_file_path(tmp_path, flow_image): 114 | """When a file path is given as input, and type is supported by PIL 115 | check that output image format is preserved.""" 116 | filepath = tmp_path / "test.png" 117 | flow_image.save(filepath) 118 | output = Image.open(filepath) 119 | assert output.format == "PNG" 120 | 121 | 122 | def test_save_flowimage_file_path_svg(tmp_path, flow_image): 123 | """When a file path is given as input and svg as the format""" 124 | filepath = tmp_path / "test.svg" 125 | flow_image.save(filepath) 126 | output_text = (tmp_path / 
"test.svg").read_text() 127 | try: 128 | ET.fromstring(output_text) 129 | except ET.ParseError: 130 | pytest.fail( 131 | "output from saving SVG to file object not well formed XML {}".format( 132 | output_text 133 | ) 134 | ) 135 | 136 | 137 | def test_save_flowimage_file_object(tmp_path, flow_image): 138 | """When a file object is given as input, use PIL interface to save""" 139 | with open(tmp_path / "test.png", "wb") as file_object: 140 | flow_image.save(file_object, format="png") 141 | output = Image.open(tmp_path / "test.png") 142 | assert output.format == "PNG" 143 | 144 | 145 | def test_save_flowimage_file_object_svg(tmp_path, flow_image): 146 | """When a file object is given as input and file is svg, use builtin interface to save""" 147 | with open(tmp_path / "test.svg", "wb") as file_object: 148 | flow_image.save(file_object, format="svg") 149 | output_text = (tmp_path / "test.svg").read_text() 150 | try: 151 | ET.fromstring(output_text) 152 | except ET.ParseError: 153 | pytest.fail( 154 | "output from saving SVG to file object not well formed XML {}".format( 155 | output_text 156 | ) 157 | ) 158 | -------------------------------------------------------------------------------- /example/ml_workflow.py: -------------------------------------------------------------------------------- 1 | """ 2 | A toy ML workflow intended to demonstrate basic Bionic features. Trains a 3 | logistic regression model on the UCI ML Breast Cancer Wisconsin (Diagnostic) 4 | dataset. 5 | """ 6 | 7 | import re 8 | 9 | import pandas as pd 10 | from sklearn import datasets, linear_model, metrics, model_selection 11 | 12 | import bionic as bn 13 | 14 | # Initialize our builder. 15 | builder = bn.FlowBuilder("ml_workflow") 16 | 17 | # Define some basic parameters. 18 | builder.assign( 19 | "random_seed", 0, doc="Arbitrary seed for all random decisions in the flow." 20 | ) 21 | builder.assign( 22 | "test_split_fraction", 0.3, doc="Fraction of data to include in test set." 23 | ) 24 | builder.assign( 25 | "hyperparams_dict", {"C": 1}, doc="Hyperparameters to use when training the model." 26 | ) 27 | builder.assign( 28 | "feature_inclusion_regex", 29 | ".*", 30 | doc="Regular expression specifying which feature names to include.", 31 | ) 32 | 33 | 34 | # Load the raw data. 35 | @builder 36 | def raw_frame(): 37 | """ 38 | The raw data, including all features and a `target` column of labels. 39 | """ 40 | dataset = datasets.load_breast_cancer() 41 | df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names) 42 | df["target"] = dataset.target 43 | return df 44 | 45 | 46 | # Select a subset of the columns to use as features. 47 | @builder 48 | def features_frame(raw_frame, feature_inclusion_regex): 49 | """Labeled data with a selected subset of the feature columns.""" 50 | included_feature_cols = [ 51 | col 52 | for col in raw_frame.columns.drop("target") 53 | if re.match(feature_inclusion_regex, col) 54 | ] 55 | return raw_frame[included_feature_cols + ["target"]] 56 | 57 | 58 | # Split the data into train and test sets. 59 | @builder 60 | # The `@outputs` decorator tells Bionic to define two new entities from this 61 | # function (which returns a tuple of two values). 
62 | @bn.outputs("train_frame", "test_frame") 63 | @bn.docs( 64 | "Subset of feature data rows, used for model training.", 65 | "Subset of feature data rows, used for model testing.", 66 | ) 67 | def split_raw_frame(features_frame, test_split_fraction, random_seed): 68 | return model_selection.train_test_split( 69 | features_frame, 70 | test_size=test_split_fraction, 71 | random_state=random_seed, 72 | ) 73 | 74 | 75 | # Fit a logistic regression model on the training data. 76 | @builder 77 | def model(train_frame, random_seed, hyperparams_dict): 78 | """A binary classifier sklearn model.""" 79 | m = linear_model.LogisticRegression( 80 | solver="liblinear", random_state=random_seed, **hyperparams_dict 81 | ) 82 | m.fit(train_frame.drop("target", axis=1), train_frame["target"]) 83 | return m 84 | 85 | 86 | # Predict probabilities for the test data. 87 | @builder 88 | def prediction_frame(model, test_frame): 89 | """ 90 | A dataframe with one column, `proba`, containing predicted probabilities for the 91 | test data. 92 | """ 93 | predictions = model.predict_proba(test_frame.drop("target", axis=1))[:, 1] 94 | df = pd.DataFrame() 95 | df["proba"] = predictions 96 | return df 97 | 98 | 99 | # Evaluate the model's precision and recall over a range of threshold values. 100 | @builder 101 | def precision_recall_frame(test_frame, prediction_frame): 102 | """ 103 | A dataframe with three columns: 104 | - `threshold`: a probability threshold for the model 105 | - `precision`: the test set precision resulting from that threshold 106 | - `recall`: the test set recall resulting from that threshold 107 | """ 108 | precisions, recalls, thresholds = metrics.precision_recall_curve( 109 | test_frame["target"], 110 | prediction_frame["proba"], 111 | ) 112 | 113 | df = pd.DataFrame() 114 | df["threshold"] = [0] + list(thresholds) + [1] 115 | df["precision"] = list(precisions) + [1] 116 | df["recall"] = list(recalls) + [0] 117 | 118 | return df 119 | 120 | 121 | # Plot the precision against the recall. 122 | @builder 123 | # The `@pyplot` decorator makes the Matplotlib plotting context available to 124 | # our function, then translates our plot into an image object. 125 | @bn.pyplot("plt") 126 | # The `@gather` decorator collects the values of of "hyperparams_dict" and 127 | # "precision_recall_frame" into a single dataframe named "gathered_frame". 128 | # This might not seem very interesting since "gathered_frame" only has one row, 129 | # but it will become useful once we introduce multiplicity. 130 | @bn.gather( 131 | over="hyperparams_dict", also="precision_recall_frame", into="gathered_frame" 132 | ) 133 | def all_hyperparams_pr_plot(gathered_frame, plt): 134 | """ 135 | A plot of precision against recall. Includes one curve for each set of 136 | hyperparameters. 137 | """ 138 | _, ax = plt.subplots(figsize=(4, 3)) 139 | for row in gathered_frame.itertuples(): 140 | label = ", ".join( 141 | f"{key}={value}" for key, value in row.hyperparams_dict.items() 142 | ) 143 | row.precision_recall_frame.plot(x="recall", y="precision", label=label, ax=ax) 144 | ax.set_xlabel("Recall") 145 | ax.set_ylabel("Precision") 146 | 147 | 148 | # Assemble our flow object. 
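# Once built, any entity can be computed on demand, for example (illustrative usage):
#
#     flow.get("model")                      # fits and returns the classifier
#     flow.get("all_hyperparams_pr_plot")    # returns the precision/recall plot image
#
# See example/ml_workflow_cli.py for a fuller command-line example.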
149 | flow = builder.build() 150 | -------------------------------------------------------------------------------- /docs/future.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | The Future of Bionic 3 | ==================== 4 | 5 | Development Status 6 | ------------------- 7 | 8 | Bionic is still at an early stage, and many features have been planned but not 9 | implemented. All of these features should be developed at some point, but the 10 | exact timeline is not fixed. 11 | 12 | Future Work 13 | ----------- 14 | 15 | Distributed Computation 16 | ....................... 17 | 18 | Currently Bionic computes everything on a single machine, using either a single 19 | process or many in parallel. Later it will be able to dispatch jobs to other machines 20 | (such as a cloud-based compute cluster) to achieve even more parallelization. 21 | 22 | Direct Access to Persisted Files 23 | ................................ 24 | 25 | Bionic is built around the idea that the user's code generally wants to operate 26 | on in-memory objects rather than files. However, in some cases it's preferable 27 | to operate on the raw files. For example, if a file is large we might want to 28 | load only small parts into memory at a time; or we might want to call an 29 | external script that only knows how to operate on files. In these cases it 30 | would be helpful to be able to do something like this: 31 | 32 | .. code-block:: python 33 | 34 | @builder 35 | @bionic.arg_as_file_path('raw_frame', 'raw_frame_path') 36 | def transformed_data(raw_frame_path): 37 | assert raw_frame_path.suffix == '.pq' 38 | subprocess.check_call(['transform_data.sh', str(raw_frame_path)]) 39 | 40 | Graph-Rewriting Decorators 41 | .......................... 42 | 43 | Normally Bionic translates each entity into a single node (or a parallel set of 44 | nodes) in its dependency graph. However, in some cases we might want to 45 | generate a more complex subgraph. For example, the author of an entity might 46 | know that its computation can be safely broken into chunks and run in parallel: 47 | 48 | .. code-block:: python 49 | 50 | @builder 51 | @bionic.parallelize_by_row('raw_frame') 52 | def filtered_data(raw_frame, relevant_categories): 53 | return raw_frame[raw_frame['category'].isin(relevant_categories)] 54 | 55 | 56 | User-Defined Decorators 57 | ....................... 58 | 59 | Bionic currently provides several built-in decorators, but their implementation 60 | is complex and tightly coupled with Bionic's internals. This is partly because 61 | we're still figuring out what Bionic's internal data model should look like. 62 | Once those internals are cleaner and more stable, it will be possible for users 63 | to write (and share) their own decorators. 64 | 65 | For example, Bionic provides a built-in :func:`@pyplot ` 66 | decorator to make Matplotlib plotting easier. We might want similar decorators 67 | for other external libraries that are awkward to use in the Bionic framework. 68 | 69 | Smarter Cache Invalidation 70 | .......................... 71 | 72 | Although Bionic attempts to automatically figure out when cached data can be 73 | used and when it needs to be recomputed, the user still needs to tell it about 74 | code changes using :func:`@version `. We have some experimental 75 | features (see :ref:`automatic-versioning`) to help with this, but they aren't
We believe we can improve their accuracy to the point where 77 | cache invalidation can be inferred automatically, without requiring the 78 | ``@version`` decorator at all. 79 | 80 | Automatic Regression Tests 81 | .......................... 82 | 83 | Following up on the concept of non-functional changes above: when a user 84 | performs a change that is supposed to be non-functional, they might actually 85 | want Bionic to verify this by re-running their code and confirming that the 86 | output is the same as the previous version's. 87 | 88 | Data Validation 89 | ............... 90 | 91 | Often we'd like to make assertions about an entity's output and be alerted if 92 | those assertions are violated. Currently this can be done in two ways: adding 93 | ``assert`` statements to the entity's function, or writing 94 | a custom `Protocol `_ with a special ``validate`` method. 95 | These solutions share two problems. First, they have to be written by the 96 | person who defines the entity; it's not possible to add new assertions about 97 | pre-existing entities. Second, if the assertions fail, the entity's value 98 | never gets persisted, so it's difficult to debug the problem -- especially if 99 | the value was expensive to compute. 100 | 101 | A better approach would be a first-class concept of an entity that validates 102 | other entities, after their value has been persisted but before it can be 103 | consumed by any other (non-validator) entities. 104 | 105 | Better Multiplicity Abstractions 106 | ................................ 107 | 108 | Bionic's concept of creating multiple values for an entity and then gathering 109 | them together is fairly novel (as far as we know), which means it will probably 110 | require some iteration before we find the best way to work with it. There are 111 | definitely many use cases of multiplicity that are awkward or impossible to 112 | express with the current API. For example, we might want one entity to be able 113 | to generate multiple downstream instances of another: for example, a 114 | ``hyperparameter_search_strategy`` entity which creates multiple instances of a 115 | ``hyperparameters_dict`` entity. 116 | -------------------------------------------------------------------------------- /tests/test_flow/test_persistence_aip.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from textwrap import dedent 3 | 4 | import pytest 5 | 6 | import bionic as bn 7 | 8 | # This is detected by pytest and applied to all the tests in this module. 9 | from bionic.aip.docker_image_builder import fix_pip_requirements 10 | from bionic.aip.state import AipError 11 | 12 | pytestmark = pytest.mark.needs_aip 13 | 14 | 15 | def test_aip_jobs(aip_builder, log_checker): 16 | builder = aip_builder 17 | 18 | builder.assign("x1", 1) 19 | 20 | # Test various combinations of memoize and persist settings for these 21 | # function entities. 
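# x2 uses the default settings, x3 disables persistence, x4 disables
# memoization, and x5 disables both, so these entities cover all four
# combinations of the two settings.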
22 | 23 | @builder 24 | def x2(): 25 | return 2 26 | 27 | @builder 28 | @bn.persist(False) 29 | def x3(): 30 | return 3 31 | 32 | @builder 33 | @bn.memoize(False) 34 | def x4(): 35 | return 4 36 | 37 | @builder 38 | @bn.persist(False) 39 | @bn.memoize(False) 40 | def x5(): 41 | return 5 42 | 43 | @builder 44 | @bn.run_in_aip("n1-standard-4") 45 | def y1(x1, x2, x3, x4, x5): 46 | return x1 + x2 + x3 + x4 + x5 + 1 47 | 48 | @builder 49 | @bn.run_in_aip("n1-standard-8") 50 | def y2(x1, x2, x3, x4, x5): 51 | return x1 + x2 + x3 + x4 + x5 + 2 52 | 53 | @builder 54 | def y3(x1, x2, x3, x4, x5): 55 | return x1 + x2 + x3 + x4 + x5 + 3 56 | 57 | @builder 58 | def y4(x1, x2, x3, x4, x5): 59 | return x1 + x2 + x3 + x4 + x5 + 4 60 | 61 | @builder 62 | def y5(x1, x2, x3, x4, x5): 63 | return x1 + x2 + x3 + x4 + x5 + 5 64 | 65 | @builder 66 | def total(y1, y2, y3, y4, y5): 67 | return y1 + y2 + y3 + y4 + y5 68 | 69 | assert builder.build().get("y1") == 16 70 | 71 | log_checker.expect_regex( 72 | r"Staging AI Platform task .* at gs://.*bionic_y1.*", 73 | r"Started AI Platform task: https://console.cloud.google.com/ai-platform/jobs/.*bionic_y1.*", 74 | r"Submitting AI Platform task .*\(name='y1'\).*CaseKey\(x1=1\).*", 75 | r"Computed y1\(x1=1\) using AI Platform", 76 | r"Downloading y1\(x1=1\) from GCS \.\.\.", 77 | ) 78 | 79 | assert builder.build().get("total") == 90 80 | 81 | log_checker.expect_regex( 82 | r"Loaded y1\(x1=1\) from disk cache", 83 | r"Staging AI Platform task .* at gs://.*bionic_y2.*", 84 | r"Started AI Platform task: https://console.cloud.google.com/ai-platform/jobs/.*bionic_y2.*", 85 | r"Submitting AI Platform task .*\(name='y2'\).*CaseKey\(x1=1\).*", 86 | r"Computed y2\(x1=1\) using AI Platform", 87 | r"Downloading y2\(x1=1\) from GCS \.\.\.", 88 | r"Computed y3\(x1=1\)", 89 | r"Computed y4\(x1=1\)", 90 | r"Computed y5\(x1=1\)", 91 | r"Computed total\(x1=1\)", 92 | ) 93 | 94 | 95 | def test_aip_fail(aip_builder, log_checker): 96 | builder = aip_builder 97 | 98 | builder.assign("x", 1) 99 | 100 | @builder 101 | @bn.run_in_aip("n1-standard-4") 102 | def x_plus_one(x): 103 | raise Exception() 104 | 105 | with pytest.raises(AipError): 106 | builder.build().get("x_plus_one") 107 | 108 | log_checker.expect_regex( 109 | r"Staging AI Platform task .* at gs://.*bionic_x_plus_one.*", 110 | r"Started AI Platform task: https://console.cloud.google.com/ai-platform/jobs/.*bionic_x_plus_one.*", 111 | r"Submitting AI Platform task .*\(name='x_plus_one'\).*CaseKey\(x=1\).*", 112 | r".*error while doing remote computation for x_plus_one\(x=1\).*AipError.*", 113 | ) 114 | 115 | 116 | def test_fix_pip_requirements(): 117 | pip_requirements = dedent( 118 | """ 119 | Package1==1.2.3 120 | Package2==2 121 | -e git+git@github.com:square/bionic.git@f13f5405e928d92b553d2cbee41084eecccf7de3#egg=bionic 122 | -e git+https://github.com/square/bionic.git@88fec3d6921ed13b7c7575cca4c292b4f7003b9c#egg=bionic 123 | """ 124 | ) 125 | 126 | fixed_pip_requirements = dedent( 127 | """ 128 | Package1==1.2.3 129 | Package2==2 130 | -e git+https://github.com/square/bionic.git@f13f5405e928d92b553d2cbee41084eecccf7de3#egg=bionic 131 | -e git+https://github.com/square/bionic.git@88fec3d6921ed13b7c7575cca4c292b4f7003b9c#egg=bionic 132 | """ 133 | ) 134 | 135 | assert fix_pip_requirements(pip_requirements) == fixed_pip_requirements 136 | 137 | 138 | @pytest.mark.needs_aip_and_docker_commit_access 139 | @pytest.mark.real_gcp_only 140 | @pytest.mark.no_parallel 141 | def test_aip_with_docker_build(aip_builder): 142 | builder = 
aip_builder 143 | builder.set("core__aip_execution__docker_image_name", None) 144 | 145 | def get_pip_freeze_exclude_editable() -> str: 146 | # pip freeze may not work properly for editable installs when running in 147 | # AIP since AIP does not have access to remote git repositories. Hence 148 | # editable installs are excluded. 149 | return subprocess.run( 150 | ["pip", "freeze", "--exclude-editable"], 151 | capture_output=True, 152 | check=True, 153 | encoding="utf-8", 154 | ).stdout 155 | 156 | @builder 157 | @bn.run_in_aip("n1-standard-4") 158 | def x(): 159 | return get_pip_freeze_exclude_editable() 160 | 161 | flow = builder.build() 162 | 163 | assert flow.get("x") == get_pip_freeze_exclude_editable() 164 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from git import Repo 5 | 6 | from bionic.utils.misc import oneline 7 | 8 | 9 | @pytest.fixture(autouse=True) 10 | def set_env_variables(monkeypatch): 11 | # We don't want to set up Stackdriver logging for local tests. 12 | monkeypatch.setenv("BIONIC_NO_STACKDRIVER", "True") 13 | yield 14 | monkeypatch.delenv("BIONIC_NO_STACKDRIVER") 15 | 16 | 17 | def pytest_addoption(parser): 18 | parser.addoption( 19 | "--slow", action="store_true", default=False, help="run slow tests" 20 | ) 21 | parser.addoption( 22 | "--bucket", action="store", help="URL to GCS bucket to use for tests" 23 | ) 24 | parser.addoption( 25 | "--aip", 26 | action="store_true", 27 | default=False, 28 | help="run AIP tests, requires --bucket", 29 | ) 30 | parser.addoption( 31 | "--parallel", 32 | action="store_true", 33 | default=False, 34 | help="also run all tests with parallel execution mode", 35 | ) 36 | 37 | 38 | def pytest_configure(config): 39 | def add_mark(name, description): 40 | config.addinivalue_line("markers", f"{name}: given test {description}") 41 | 42 | # These markers are added manually. 43 | add_mark("slow", "runs slowly") 44 | add_mark("needs_gcs", "requires GCS to run") 45 | add_mark("needs_aip", "requires AIP execution to run") 46 | add_mark("needs_parallel", "requires parallel execution to run") 47 | add_mark("no_parallel", "does not run with parallel execution") 48 | add_mark( 49 | "allows_parallel", 50 | "can run with parallel execution even when that's not explicitly enabled", 51 | ) 52 | add_mark("real_gcp_only", "runs on real GCP only") 53 | add_mark("fake_gcp_only", "runs on fake GCP only") 54 | add_mark( 55 | "needs_aip_and_docker_commit_access", 56 | "requires AIP and docker access to the current git commit", 57 | ) 58 | 59 | # These markers are added automatically based on parametric fixtures. 60 | add_mark("serial", "will run using serial execution") 61 | add_mark("parallel", "will run using parallel execution") 62 | add_mark("real_gcp", "use real gcp") 63 | add_mark("fake_gcp", "use fake gcp") 64 | 65 | # This marker is added automatically based on other markers. 
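# Specifically, pytest_collection_modifyitems below marks an item as "baseline"
# only if it needs none of --slow, --bucket, --aip, or --parallel.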
66 | add_mark("baseline", "runs by default when no options are passed to pytest") 67 | 68 | 69 | def is_current_commit_remotely_available(): 70 | repo = Repo(os.getcwd(), search_parent_directories=True) 71 | return ( 72 | not repo.is_dirty() 73 | and len(repo.git.branch("-r", "--contains", repo.head.ref.object.hexsha)) > 0 74 | ) 75 | 76 | 77 | def pytest_collection_modifyitems(config, items): 78 | also_run_slow = config.getoption("--slow") 79 | skip_slow = pytest.mark.skip(reason="only runs when --slow is set") 80 | 81 | has_gcs = config.getoption("--bucket") 82 | skip_needs_gcs = pytest.mark.skip(reason="only runs when --bucket is set") 83 | 84 | has_aip = has_gcs and config.getoption("--aip") 85 | skip_needs_aip = pytest.mark.skip( 86 | reason="only runs when both --bucket and --aip are set" 87 | ) 88 | 89 | also_run_parallel = config.getoption("--parallel") 90 | 91 | items_to_keep = [] 92 | for item in items: 93 | item_is_baseline = True 94 | 95 | if "slow" in item.keywords: 96 | item_is_baseline = False 97 | if not also_run_slow: 98 | item.add_marker(skip_slow) 99 | 100 | if "real_gcp" in item.keywords: 101 | if "fake_gcp_only" in item.keywords: 102 | continue 103 | 104 | if "needs_gcs" in item.keywords: 105 | item_is_baseline = False 106 | if not has_gcs: 107 | item.add_marker(skip_needs_gcs) 108 | 109 | if "needs_aip" in item.keywords: 110 | item_is_baseline = False 111 | if not has_aip: 112 | item.add_marker(skip_needs_aip) 113 | 114 | elif "fake_gcp" in item.keywords: 115 | if "real_gcp_only" in item.keywords: 116 | continue 117 | 118 | if "parallel" in item.keywords: 119 | if "allows_parallel" not in item.keywords: 120 | item_is_baseline = False 121 | 122 | if "no_parallel" in item.keywords or not also_run_parallel: 123 | continue 124 | 125 | elif "needs_parallel" in item.keywords: 126 | continue 127 | 128 | if "needs_aip_and_docker_commit_access" in item.keywords: 129 | if not has_aip: 130 | item.add_marker(skip_needs_aip) 131 | elif not is_current_commit_remotely_available(): 132 | item.add_marker( 133 | pytest.mark.skip( 134 | reason=oneline( 135 | """ 136 | only runs when --bucket and --aip are set and the 137 | current git commit is available for access by docker 138 | build; that means the commit is pushed to the remote 139 | repository 140 | """ 141 | ) 142 | ) 143 | ) 144 | 145 | if item_is_baseline: 146 | item.add_marker(pytest.mark.baseline) 147 | 148 | items_to_keep.append(item) 149 | 150 | items.clear() 151 | items.extend(items_to_keep) 152 | -------------------------------------------------------------------------------- /tests/test_utils/test_keyed_priority_stack.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from random import Random 4 | 5 | from bionic.utils.keyed_priority_stack import KeyedPriorityStack 6 | 7 | 8 | def test_simple_push(): 9 | kps = KeyedPriorityStack() 10 | 11 | assert len(kps) == 0 12 | 13 | kps.push("ONE", "1", 1) 14 | kps.push("TWO_A", "2a", 2) 15 | kps.push("THREE", "3", 3) 16 | kps.push("TWO_B", "2b", 2) 17 | 18 | assert len(kps) == 4 19 | 20 | assert kps.pop() == "3" 21 | assert kps.pop() == "2b" 22 | assert kps.pop() == "2a" 23 | assert kps.pop() == "1" 24 | 25 | assert len(kps) == 0 26 | 27 | 28 | def test_pop_by_key(): 29 | kps = KeyedPriorityStack() 30 | 31 | with pytest.raises(KeyError): 32 | kps.pop("ONE") 33 | 34 | kps.push("ONE", "1", 1) 35 | kps.push("TWO_A", "2a", 2) 36 | kps.push("THREE", "3", 3) 37 | kps.push("TWO_B", "2b", 2) 38 | 39 | with pytest.raises(KeyError): 
40 | kps.pop("1") 41 | 42 | assert kps.pop("TWO_B") == "2b" 43 | assert kps.pop() == "3" 44 | assert kps.pop("TWO_A") == "2a" 45 | assert kps.pop() == "1" 46 | 47 | with pytest.raises(KeyError): 48 | kps.pop("THREE") 49 | 50 | 51 | def test_incomparable_unhashable_values(): 52 | class Wrapper: 53 | def __init__(self, value): 54 | self.value = value 55 | 56 | def __eq__(self, other): 57 | raise NotImplementedError("!") 58 | 59 | def __hash__(self, other): 60 | raise NotImplementedError("!") 61 | 62 | kps = KeyedPriorityStack() 63 | 64 | kps.push("ONE", Wrapper("1"), 1) 65 | kps.push("TWO_A", Wrapper("2a"), 2) 66 | kps.push("THREE", Wrapper("3"), 3) 67 | kps.push("TWO_B", Wrapper("2b"), 2) 68 | 69 | assert kps.pop().value == "3" 70 | assert kps.pop().value == "2b" 71 | assert kps.pop().value == "2a" 72 | assert kps.pop().value == "1" 73 | 74 | 75 | def test_random(): 76 | """ 77 | Tests our data structure by applying a series of random operations and comparing 78 | the results to an oracle (SimpleKeyedPriorityStack). 79 | """ 80 | 81 | random = Random(0) 82 | MAX_VALUE = 1000000 83 | 84 | test_kps = KeyedPriorityStack() 85 | ctrl_kps = SimpleKeyedPriorityStack() 86 | 87 | def do_push(): 88 | value = random.randrange(MAX_VALUE) 89 | priority = random.randrange(MAX_VALUE) 90 | key = random.randrange(MAX_VALUE) 91 | 92 | test_kps.push(key, value, priority) 93 | ctrl_kps.push(key, value, priority) 94 | 95 | def do_and_check_pop(): 96 | if len(test_kps) == 0: 97 | with pytest.raises(IndexError): 98 | test_kps.pop() 99 | return 100 | 101 | assert test_kps.pop() == ctrl_kps.pop() 102 | 103 | def do_and_check_pop_with_key(): 104 | key = ctrl_kps._get_random_key(random) 105 | if key is None: 106 | return 107 | 108 | assert test_kps.pop(key) == ctrl_kps.pop(key) 109 | 110 | def check_pop_missing_key(): 111 | key = random.randrange(MAX_VALUE) + MAX_VALUE 112 | 113 | with pytest.raises(KeyError): 114 | test_kps.pop(key) 115 | 116 | def check_push(): 117 | key = ctrl_kps._get_random_key(random) 118 | if key is None: 119 | return 120 | value = random.randrange(MAX_VALUE) 121 | priority = random.randrange(MAX_VALUE) 122 | 123 | with pytest.raises(ValueError): 124 | test_kps.push(key, value, priority) 125 | 126 | def check_len(): 127 | assert len(test_kps) == len(ctrl_kps) 128 | 129 | N_ITERS = 3000 130 | ACTIONS = [ 131 | # We have more pushes than pops, so the size of the stack should tend to grow 132 | # over time. 133 | do_push, 134 | do_push, 135 | do_push, 136 | do_and_check_pop, 137 | do_and_check_pop_with_key, 138 | check_len, 139 | check_push, 140 | check_pop_missing_key, 141 | ] 142 | for i in range(N_ITERS): 143 | action = random.choice(ACTIONS) 144 | action() 145 | while len(test_kps) > 0: 146 | do_and_check_pop() 147 | check_len() 148 | 149 | 150 | class SimpleKeyedPriorityStack: 151 | """ 152 | An alternative implementation of KeyedPriorityStack which is simpler but less 153 | efficient. 
154 | """ 155 | 156 | def __init__(self): 157 | self._sorted_quads = [] 158 | self._next_seq_id = 0 159 | 160 | def push(self, key, value, priority): 161 | seq_id = self._next_seq_id 162 | self._next_seq_id += 1 163 | 164 | self._sorted_quads.append([priority, seq_id, value, key]) 165 | self._sorted_quads.sort() 166 | 167 | def pop(self, key=None): 168 | if key is not None: 169 | ix = self._quad_ix_for_key(key) 170 | _, _, value, _ = self._sorted_quads.pop(ix) 171 | return value 172 | 173 | else: 174 | _, _, value, _ = self._sorted_quads.pop() 175 | return value 176 | 177 | def __len__(self): 178 | return len(self._sorted_quads) 179 | 180 | def _quad_ix_for_key(self, key): 181 | (quad_ix,) = [ 182 | quad_ix 183 | for (quad_ix, (_, _, _, quad_key)) in enumerate(self._sorted_quads) 184 | if quad_key == key 185 | ] 186 | return quad_ix 187 | 188 | def _get_random_key(self, random): 189 | if len(self) == 0: 190 | return None 191 | return random.choice(self._sorted_quads)[3] 192 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Contributing to Bionic 3 | ====================== 4 | 5 | Bionic's source is maintained on `GitHub `_. 6 | You can clone it with: 7 | 8 | .. code-block:: bash 9 | 10 | git clone git@github.com:square/bionic.git 11 | 12 | Pull requests are welcome! (However, for large changes, we recommend 13 | discussing the proposed change on our `Issues page 14 | `_ first.) Because Bionic is 15 | supported by Square, all new contributors will be asked to sign `Square's 16 | Contributor License Agreement 17 | `_ as part 18 | of the pull request process. 19 | 20 | For Bionic core developers, our internal processes are documented :doc:`here 21 | `. 22 | 23 | Submitting a Pull Request 24 | ------------------------- 25 | 26 | To maintain a baseline level of correctness, readability, and design 27 | coherence, every pull request to Bionic is reviewed by a maintainer. 28 | Maintainers typically check at least the following: 29 | 30 | 1. If you're making changes to Bionic's behavior, include tests if possible, 31 | and add an entry to the `Release Notes `_. 32 | 2. If you're updating Bionic's user-facing API, :ref:`update the 33 | documentation `. 34 | 3. Make sure all existing :ref:`tests ` and :ref:`style checks 35 |