├── catalog └── .gitignore ├── notebooks └── .gitkeep ├── reference ├── .gitkeep ├── datasets │ └── .gitkeep ├── templates │ └── .gitkeep └── easydata │ ├── images │ └── toolbar-screenshot.png │ ├── project-layout.md │ ├── git-workflow.md │ ├── git-configuration.md │ ├── troubleshooting.md │ ├── paths.md │ ├── easydata.md │ ├── datasets.md │ ├── notebooks.md │ ├── conda-environments.md │ └── sharing-your-work.md ├── reports ├── .gitkeep └── figures │ └── .gitkeep ├── src ├── data │ ├── .gitkeep │ ├── __init__.py │ ├── process_functions.py │ ├── extra.py │ ├── utils.py │ ├── transformer_functions.py │ ├── catalog.py │ └── fetch.py ├── tests │ ├── __init__.py │ ├── no_ci │ │ ├── __init__.py │ │ └── test_user_dataset_environment_integration.py │ ├── test_imports.py │ ├── test_ci.py │ ├── test_catalog.py │ ├── test_iter_directory.py │ ├── make_test_datasets.py │ └── data │ │ └── dataset-test.json ├── analysis │ └── __init__.py ├── log │ └── __init__.py ├── exceptions.py ├── __init__.py ├── decorators.py ├── conftest.py ├── workflow.py ├── _paths.py ├── utils │ ├── ipynbname.py │ └── __init__.py ├── kvstore.py └── helpers.py ├── models ├── figures │ └── .gitkeep ├── output │ └── .gitkeep └── trained │ └── .gitkeep ├── docs ├── commands.rst ├── getting-started.rst ├── index.rst ├── make.bat ├── Makefile └── conf.py ├── setup.py ├── Makefile ├── .easydata.yml ├── Makefile.include ├── .easydata.json ├── .post-create-environment.txt ├── environment.yml ├── LICENSE ├── .gitignore ├── README.md ├── .circleci └── config.yml ├── Makefile.help └── Makefile.envs /catalog/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reference/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/figures/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/output/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/trained/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reference/datasets/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reference/templates/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/figures/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/tests/no_ci/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .catalog import * 2 | from .datasets import * 3 | from .fetch import * 4 | from .utils import * 5 | from .extra import * 6 | -------------------------------------------------------------------------------- /reference/easydata/images/toolbar-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/make_better_defaults/HEAD/reference/easydata/images/toolbar-screenshot.png -------------------------------------------------------------------------------- /docs/commands.rst: -------------------------------------------------------------------------------- 1 | Commands 2 | ======== 3 | 4 | The Makefile contains the central entry points for common tasks related to this project. 5 | Type `make` for help 6 | -------------------------------------------------------------------------------- /src/data/process_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Custom dataset processing/generation functions should be added to this file 3 | """ 4 | 5 | import pathlib 6 | 7 | from tqdm.auto import tqdm 8 | 9 | from .. import paths 10 | from ..log import logger 11 | 12 | __all__ = [ 13 | ] 14 | -------------------------------------------------------------------------------- /docs/getting-started.rst: -------------------------------------------------------------------------------- 1 | Getting started 2 | =============== 3 | 4 | This is where you describe how to get set up on a clean install, including the 5 | commands necessary to get the raw data (using the `sync_data_from_s3` command, 6 | for example), and then how to make the cleaned, final data sets. 7 | -------------------------------------------------------------------------------- /src/log/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | _log_fmt = '%(asctime)s - %(module)s - %(levelname)s - %(message)s' 6 | logging.basicConfig(level=os.environ.get('LOGLEVEL', 'INFO'), format=_log_fmt) 7 | _MODULE = sys.modules[__name__] 8 | logger = logging.getLogger(__name__) 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='src', 5 | packages=find_packages(), 6 | version='2.0.0', 7 | description='''Make Better Defaults: Improving your data science workflows with "make". 
A Pydata Global 2021 Talk''', 8 | author='Kjell Wooding ', 9 | license='MIT', 10 | ) 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | include Makefile.include 2 | include Makefile.envs 3 | include Makefile.help 4 | 5 | # 6 | # COMMANDS # 7 | # 8 | 9 | .PHONY: clean 10 | ## Delete all compiled Python files 11 | clean: 12 | find . -type f -name "*.py[co]" -delete 13 | find . -type d -name "__pycache__" -delete 14 | rm -f .make.* 15 | -------------------------------------------------------------------------------- /src/tests/no_ci/test_user_dataset_environment_integration.py: -------------------------------------------------------------------------------- 1 | ## Test dataset information 2 | import unittest 3 | 4 | from src.data import Dataset 5 | 6 | 7 | class TestDatasets(unittest.TestCase): 8 | """ 9 | Basic smoke tests to ensure that all of the available datasets 10 | load and have some expected property. 11 | """ 12 | def basic_unit_test(self): 13 | assert True 14 | -------------------------------------------------------------------------------- /.easydata.yml: -------------------------------------------------------------------------------- 1 | default_context: 2 | author_name: Kjell Wooding 3 | conda_path: ~/miniconda3/bin/conda 4 | default_branch: main 5 | description: 'Make Better Defaults: Improving your data science workflows with "make". A Pydata Global 2021 Talk' 6 | module_name: src 7 | open_source_license: MIT 8 | project_name: Make Better Defaults 9 | python_version: '3.8' 10 | repo_name: make_better_defaults 11 | upstream_location: github.com 12 | -------------------------------------------------------------------------------- /Makefile.include: -------------------------------------------------------------------------------- 1 | CONDA_EXE ?= ~/miniconda3/bin/conda 2 | DEBUG_FILE := debug.txt 3 | MODULE_NAME := src 4 | TESTS_NO_CI = $(MODULE_NAME)/tests/no_ci 5 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 6 | PROJECT_NAME := make_better_defaults 7 | PYTHON_INTERPRETER := python3 8 | ARCH := $(shell $(PYTHON_INTERPRETER) -c "import platform; print(platform.platform())") 9 | VIRTUALENV := conda 10 | EASYDATA_LOCKFILE := environment.$(ARCH).lock.yml 11 | -------------------------------------------------------------------------------- /.easydata.json: -------------------------------------------------------------------------------- 1 | { 2 | "_template": "easydata", 3 | "author_name": "Kjell Wooding ", 4 | "conda_path": "~/miniconda3/bin/conda", 5 | "default_branch": "main", 6 | "description": "Make Better Defaults: Improving your data science workflows with \"make\". A Pydata Global 2021 Talk", 7 | "module_name": "src", 8 | "open_source_license": "MIT", 9 | "project_name": "Make Better Defaults", 10 | "python_version": "3.8", 11 | "repo_name": "make_better_defaults", 12 | "upstream_location": "github.com" 13 | } 14 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Make Better Defaults documentation master file, created by 2 | sphinx-quickstart. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Make Better Defaults documentation! 7 | ============================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | getting-started 15 | commands 16 | 17 | 18 | 19 | Indices and tables 20 | ================== 21 | 22 | * :ref:`genindex` 23 | * :ref:`modindex` 24 | * :ref:`search` 25 | -------------------------------------------------------------------------------- /.post-create-environment.txt: -------------------------------------------------------------------------------- 1 | 2 | Now would be a good time to initialize a git repo; i.e. 3 | >>> git init 4 | >>> git add . 5 | >>> git commit -m 'initial import' 6 | >>> git branch easydata # tag for future easydata upgrades 7 | 8 | NOTE: By default, raw data is installed and unpacked in the 9 | `make_better_defaults/data` directory. If you are working with big data (or 10 | have a small disk partition), it is HIGHLY recommended that you point 11 | this directory elsewhere; i.e. by setting paths['data_path']. For example: 12 | >>> conda activate make_better_defaults 13 | >>> python -c "import src; src.paths['data_path'] = '/path/to/big/data'" 14 | 15 | You have been warned. 16 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: make_better_defaults 2 | channels: 3 | - defaults 4 | dependencies: 5 | - pip 6 | - pip: 7 | - -e . # conda >= 4.4 only 8 | - python-dotenv>=0.5.1 9 | - nbval 10 | - nbdime 11 | - gdown 12 | - setuptools 13 | - wheel 14 | - git>=2.5 # for git worktree template updating 15 | - sphinx 16 | - bokeh 17 | - click 18 | - colorcet 19 | - coverage 20 | - coveralls 21 | - matplotlib 22 | - jupyter 23 | - scikit-learn 24 | - scipy 25 | - joblib 26 | - nb_conda_kernels # Try <2.2.0 if you hit nb_conda_kernels issue #158 27 | - pandas 28 | - requests 29 | - pathlib 30 | - fsspec 31 | - python=3.8 32 | 33 | -------------------------------------------------------------------------------- /src/tests/test_imports.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class TestImports(unittest.TestCase): 5 | """ 6 | Basic smoke test to ensure that the installed packages can actually be 7 | imported (we had a compatibility issue once that was not resolved 8 | properly by conda). 9 | """ 10 | def test_infrastructure_packages(self): 11 | import gdown 12 | import sphinx 13 | import click 14 | import joblib 15 | import requests 16 | 17 | def test_common_packages(self): 18 | import numpy 19 | import scipy.sparse 20 | import pandas 21 | import bokeh 22 | import matplotlib 23 | import sklearn 24 | -------------------------------------------------------------------------------- /src/exceptions.py: -------------------------------------------------------------------------------- 1 | class EasydataError(Exception): 2 | """General Easydata Error. Further error types are subclassed from this Exception""" 3 | pass 4 | 5 | class ValidationError(EasydataError): 6 | """Hash check failed""" 7 | pass 8 | 9 | class ObjectCollision(EasydataError): 10 | """Object already exists in object store 11 | 12 | This is more general than a FileExistsError, as it applies to more than just the filesystem. 13 | """ 14 | pass 15 | 16 | class NotFoundError(EasydataError): 17 | """Named object not found in object store 18 | 19 | This is more general than a FileNotFoundError, as it applies to more than just the filesystem. 
20 | """ 21 | pass 22 | -------------------------------------------------------------------------------- /src/tests/test_ci.py: -------------------------------------------------------------------------------- 1 | ## Test dataset information 2 | import logging 3 | import unittest 4 | 5 | from src.data import Dataset 6 | from src import workflow 7 | from src.log import logger 8 | 9 | 10 | class TestDatasetsSmall(unittest.TestCase): 11 | """ 12 | Basic smoke tests to ensure that the smaller (and more quickly processed) 13 | available datasets load and have some expected property. 14 | """ 15 | def test_20_newsgroups(self): 16 | ds = Dataset.load('20_newsgroups') 17 | ds = Dataset.load('20_newsgroups') 18 | assert len(ds.data) == 18846 19 | assert len(ds.target) == 18846 20 | 21 | def test_logging_is_debug_level(): 22 | assert logger.getEffectiveLevel() == logging.DEBUG 23 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from ._paths import Paths 3 | 4 | _module_dir = pathlib.Path(__file__).parent.resolve() 5 | 6 | _path_defaults = { 7 | 'cache_path': '${data_path}/interim/cache', 8 | 'data_path': '${project_path}/data', 9 | 'figures_path': '${output_path}/figures', 10 | 'interim_data_path': '${data_path}/interim', 11 | 'notebook_path': '${project_path}/notebooks', 12 | 'output_path': '${project_path}/reports', 13 | 'processed_data_path': '${data_path}/processed', 14 | 'project_path': '${catalog_path}/..', 15 | 'raw_data_path': '${data_path}/raw', 16 | 'template_path': '${project_path}/reference/templates', 17 | } 18 | _catalog_file = _module_dir.parent / "catalog" / "config.ini" 19 | 20 | paths = Paths(_path_defaults, config_file=_catalog_file, config_section="Paths") 21 | -------------------------------------------------------------------------------- /src/decorators.py: -------------------------------------------------------------------------------- 1 | 2 | # Singleton/SingletonDecorator.py 3 | class SingletonDecorator: 4 | """Turns a class into a Singleton class 5 | 6 | When placed before a class definition, ensures that all 7 | insances of this class return the same data; i.e. editing one 8 | will change them all. 
9 | """ 10 | def __init__(self,klass): 11 | self.klass = klass 12 | self.instance = None 13 | def __call__(self,*args,**kwds): 14 | if self.instance == None: 15 | self.instance = self.klass(*args,**kwds) 16 | return self.instance 17 | 18 | # https://softwareengineering.stackexchange.com/questions/386755/sharing-docstrings-between-similar-functions 19 | def is_documented_by(original): 20 | def wrapper(target): 21 | target.__doc__ = original.__doc__ 22 | return target 23 | return wrapper 24 | -------------------------------------------------------------------------------- /src/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib as pl 3 | import pytest 4 | import shutil 5 | import tempfile as tf 6 | 7 | from src.log import logger 8 | 9 | 10 | @pytest.fixture 11 | def manage_config_ini(doctest_namespace): 12 | path_config_ini = pl.Path("config.ini") 13 | if path_config_ini.exists(): 14 | # Save the current config.ini 15 | fd_temp, path_temp = tf.mkstemp() 16 | try: 17 | shutil.copyfile(path_config_ini, path_temp) 18 | path_config_ini.unlink() 19 | yield 20 | shutil.copyfile(path_temp, path_config_ini) 21 | finally: 22 | os.close(fd_temp) 23 | os.remove(path_temp) 24 | else: 25 | # Make sure we don't leave a spurious config.ini 26 | try: 27 | yield 28 | finally: 29 | if path_config_ini.exists(): 30 | path_config_ini.unlink() 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | The MIT License (MIT) 3 | Copyright (c) 2021, Kjell Wooding 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | 11 | -------------------------------------------------------------------------------- /src/workflow.py: -------------------------------------------------------------------------------- 1 | # Workflow is where we patch around API issues in between releases. 2 | # Nothing in this file is intended to be a stable API. 
use at your own risk, 3 | # as its contents will be regularly deprecated 4 | import sys 5 | import logging 6 | from .data import Catalog, Dataset, DataSource 7 | from .log import logger 8 | 9 | __all__ = [ 10 | 'make_target' 11 | ] 12 | 13 | def make_target(target): 14 | """process command from makefile 15 | 16 | Parameters 17 | ---------- 18 | target: target to execute 19 | """ 20 | 21 | if target == "datasets": 22 | c = Catalog.load('datasets') 23 | for dsname in c: 24 | logger.info(f"Generating Dataset:'{dsname}'") 25 | ds = Dataset.load(dsname) 26 | elif target == "datasources": 27 | c = Catalog.load('datasources') 28 | for name in c: 29 | logger.info(f"Fetching, unpacking, and processing DataSource:'{name}'") 30 | dsrc = DataSource.from_catalog(name) 31 | ds = dsrc.process() 32 | else: 33 | raise NotImplementedError(f"Target: '{target}' not implemented") 34 | 35 | 36 | if __name__ == '__main__': 37 | make_target(sys.argv[1]) 38 | -------------------------------------------------------------------------------- /src/tests/test_catalog.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pathlib 3 | 4 | from src.data import Catalog 5 | from src.log import logger 6 | 7 | @pytest.fixture 8 | def catalog(tmpdir): 9 | """Create a test catalog""" 10 | 11 | # Setup 12 | # tmpdir should be empty when we get here 13 | c = Catalog.create(catalog_path=tmpdir) 14 | yield c 15 | 16 | # Teardown 17 | 18 | @pytest.fixture 19 | def old_catalog_file(): 20 | test_dir = pathlib.Path(__file__).parent.resolve() 21 | 22 | yield test_dir / 'data' / 'dataset-test.json' 23 | 24 | def test_old_catalog_init(tmpdir, old_catalog_file): 25 | c = Catalog.from_old_catalog(old_catalog_file, catalog_path=tmpdir) 26 | # Verify the catalog is nonempty and contains the expected data 27 | assert len(c) == 4 28 | for dsname in ["wine_reviews_130k", "wine_reviews_150k", 'wine_reviews_130k_varietals_75', 'wine_reviews']: 29 | assert dsname in c 30 | 31 | # Should fail, as it already exists 32 | with pytest.raises(FileExistsError): 33 | c = Catalog.from_old_catalog(old_catalog_file, catalog_path=tmpdir) 34 | 35 | # Should succeed, as replace is set 36 | c = Catalog.from_old_catalog(old_catalog_file, catalog_path=tmpdir, replace=True) 37 | -------------------------------------------------------------------------------- /src/tests/test_iter_directory.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from tempfile import mkdtemp 3 | from pathlib import Path 4 | import pytest 5 | import shutil 6 | 7 | from ..data.utils import iter_directory 8 | 9 | 10 | @contextmanager 11 | def dir_temp() -> Path: 12 | path = Path(mkdtemp()) 13 | try: 14 | yield path 15 | finally: 16 | shutil.rmtree(path) 17 | 18 | 19 | def test_iter_directory_empty(): 20 | with dir_temp() as d: 21 | assert list(iter_directory(d)) == [] 22 | 23 | 24 | def test_iter_directory_flat(): 25 | with dir_temp() as d: 26 | (d / "qwer").touch() 27 | (d / "asdf").touch() 28 | (d / "ghgh").touch() 29 | (d / "1234").touch() 30 | assert list(iter_directory(d)) == [d / i for i in ["1234", "asdf", "ghgh", "qwer"]] 31 | 32 | 33 | def test_iter_directory_deep(): 34 | with dir_temp() as d: 35 | (d / "a" / "b" / "a" / "A").mkdir(parents=True) 36 | (d / "a" / "hoho").touch() 37 | (d / "1").touch() 38 | (d / "a" / "b" / "a" / "A" / "v").touch() 39 | (d / "a" / "b" / "3").touch() 40 | (d / "a" / "b" / "z").touch() 41 | assert 
list(iter_directory(d)) == [ 42 | d / i 43 | for i in ["1", "a", "a/b", "a/b/3", "a/b/a", "a/b/a/A", "a/b/a/A/v", "a/b/z", "a/hoho"] 44 | ] 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data and local config from source control by default 79 | /data/ 80 | catalog/config.ini 81 | 82 | # Mac OS-specific storage files 83 | .DS_Store 84 | 85 | # Vim 86 | *.swp 87 | *.swo 88 | 89 | # Emacs 90 | *~ 91 | .*~ 92 | 93 | # Makefile Machinery 94 | .make.* 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Make Better Defaults 2 | ==================== 3 | _Author: Kjell Wooding _ 4 | 5 | This is the git repo for [Makefiles: One great trick for making your conda environments more managable](https://pydata.org/global2021/schedule/presentation/24/makefiles-one-great-trick-for-making-your-conda-environments-more-managable/). A [Pydata Global 2021](https://pydata.org/global2021/) talk given on October 28, 2021 by Kjell Wooding. ([Video](https://www.youtube.com/watch?v=cCzkL9DhWEE)) 6 | 7 | Getting Started 8 | --------------- 9 | 10 | To get started, type "make". 11 | 12 | To follow along, watch [the video](https://www.youtube.com/watch?v=cCzkL9DhWEE). 13 | 14 | To learn more about Easydata, the framework that generated this repo, see the [Getting Started Guide](reference/easydata/easydata.md). 15 | 16 | The Tips 17 | -------- 18 | 1. Use git and virtual environments. Always. 19 | 2. Good workflow trumps good tooling 20 | 3. Good workflow means not having to remember things 21 | 4. Use one virtual environment per git repo. Give them both the same name. 22 | 5. Maintain virtual environments as code. 23 | 6. Use Lockfiles: Separate "what you want" from "what you need". 24 | 7. Auto-document your workflow 25 | 8. Don't be afraid to "Nuke it from orbit" 26 | 27 | The Implementation 28 | ------------------ 29 | See https://github.com/hackalog/make_better_defaults 30 | 31 | Directory Structure 32 | ------------------- 33 | See [Project Organization](reference/easydata/project-layout.md) for details on how this project is organized on disk. 
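The directory layout above is wired together by a handful of `make` targets. As a rough sketch of the typical end-to-end flow (run `make` on its own for the authoritative, self-documenting list of targets):

```bash
make create_environment        # build the conda environment from environment.yml
conda activate make_better_defaults
make update_environment        # resolve dependencies and write environment.(platform).lock.yml
make                           # self-documenting help: lists all available targets
make delete_environment        # "nuke it from orbit" and rebuild when things break
```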
34 | 35 | -------- 36 | 37 | This project was built using [Easydata](https://github.com/hackalog/easydata), a Python framework aimed at making your data science workflow reproducible.
38 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` 11 | - image: circleci/python:3.7.0 12 | 13 | # Specify service dependencies here if necessary 14 | # CircleCI maintains a library of pre-built images 15 | # documented at https://circleci.com/docs/2.0/circleci-images/ 16 | # - image: circleci/postgres:9.4 17 | 18 | working_directory: ~/repo 19 | 20 | steps: 21 | - checkout 22 | 23 | - run: 24 | name: Set up Anaconda 25 | command: | 26 | wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; 27 | chmod +x ~/miniconda.sh; 28 | ~/miniconda.sh -b -p ~/miniconda; 29 | echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; 30 | 31 | - run: 32 | name: Create environment and contrive to always use it 33 | command: | 34 | conda update --yes --quiet conda; 35 | export CONDA_EXE=/home/circleci/miniconda/bin/conda 36 | make create_environment 37 | conda init bash 38 | sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV 39 | echo "conda activate make_better_defaults" >> $BASH_ENV; 40 | 41 | - run: 42 | name: Create test report directory 43 | command: | 44 | mkdir test-reports 45 | 46 | # Cache dependencies 47 | - save_cache: 48 | key: pip-cache 49 | paths: 50 | - ~/.cache/pip 51 | 52 | - restore_cache: 53 | keys: 54 | - pip-cache 55 | 56 | - run: 57 | name: Run tests 58 | command: | 59 | make test CI_RUNNING=yes 60 | 61 | - store_test_results: 62 | path: test-reports 63 | 64 | - store_artifacts: 65 | path: test-reports 66 | destination: test-reports 67 | -------------------------------------------------------------------------------- /reference/easydata/project-layout.md: -------------------------------------------------------------------------------- 1 | Easydata Project Layout 2 | ----------------------- 3 | * `LICENSE` 4 | * `Makefile` 5 | * Top-level makefile. Type `make` for a list of valid commands. 6 | * `Makefile.include` 7 | * Global includes for makefile routines. Included by `Makefile`. 8 | * `Makefile.env` 9 | * Command for maintaining reproducible conda environment. Included by `Makefile`. 10 | * `README.md` 11 | * this file 12 | * `catalog` 13 | * Data catalog. This is where config information such as data sources 14 | and data transformations are saved. 15 | * `catalog/config.ini` 16 | * Local Data Store. This configuration file is for local data only, and is never checked into the repo. 17 | * `data` 18 | * Data directory. Often symlinked to a filesystem with lots of space. 19 | * `data/raw` 20 | * Raw (immutable) hash-verified downloads. 21 | * `data/interim` 22 | * Extracted and interim data representations. 23 | * `data/interim/cache` 24 | * Dataset cache 25 | * `data/processed` 26 | * The final, canonical data sets ready for analysis. 27 | * `docs` 28 | * Sphinx-format documentation files for this project. 29 | * `docs/Makefile`: Makefile for generating HTML/Latex/other formats from Sphinx-format documentation. 30 | * `notebooks` 31 | * Jupyter notebooks. Naming convention is a number (for ordering), 32 | the creator's initials, and a short `-` delimited description, 33 | e.g. 
`1.0-jqp-initial-data-exploration`. 34 | * `reference` 35 | * Data dictionaries, documentation, manuals, scripts, papers, or other explanatory materials. 36 | * `reference/easydata`: Easydata framework and workflow documentation. 37 | * `reference/templates`: Templates and code snippets for Jupyter 38 | * `reference/dataset`: resources related to datasets; e.g. dataset creation notebooks and scripts 39 | * `reports` 40 | * Generated analysis as HTML, PDF, LaTeX, etc. 41 | * `reports/figures` 42 | * Generated graphics and figures to be used in reporting. 43 | * `environment.yml` 44 | * The user-readable YAML file for reproducing the conda/pip environment. 45 | * `environment.(platform).lock.yml` 46 | * resolved versions, result of processing `environment.yml` 47 | * `setup.py` 48 | * Turns contents of `src` into a 49 | pip-installable python module (`pip install -e .`) so it can be 50 | imported in python code. 51 | * `src` 52 | * Source code for use in this project. 53 | * `src/__init__.py` 54 | * Makes `src` a Python module. 55 | * `src/data` 56 | * Scripts to fetch or generate data. 57 | * `src/analysis` 58 | * Scripts to turn datasets into output products. 59 | -------------------------------------------------------------------------------- /reference/easydata/git-workflow.md: -------------------------------------------------------------------------------- 1 | # The Easydata Git Workflow 2 | Here's our suggestion for a reliable git workflow that works well in small team settings using [Easydata][cookiecutter-easydata]. 3 | 4 | ## Git configuration 5 | 6 | If you haven't yet done so, please follow the instrucitons 7 | in our [Git Configuration Guide](git-configuration.md) first. 8 | 9 | ## Git Workflow 10 | 11 | We suggest you start each day by doing this: 12 | 13 | ### Where was I? What was I doing? Did I check it in? 14 | Sometimes, you stop work without checking things back in to the repo. 15 | Now, before you do any additional work, is the time to fix that. 16 | ```bash 17 | git branch # what branch am I on? 18 | git status # are there any files that need checking in? 19 | git add -p # accept or reject parts of the modified files 20 | git commit -m "put your commit message here" 21 | ``` 22 | 23 | ### Did I do any work elsewhere? 24 | Did you make changes to your personal fork, but on a different machine? Make sure your local branch is up-to-date with your personal fork (`origin`): 25 | ```bash 26 | git checkout main 27 | git fetch origin --prune 28 | git merge origin/main 29 | ``` 30 | 31 | ### What happened upstream? 32 | Did someone make changes to the `upstream` repo in your absense? 33 | Let's fetch and merge those changes 34 | 35 | ```bash 36 | git checkout main 37 | git fetch upstream --prune 38 | git merge upstream/main 39 | git push origin main 40 | make update_environment 41 | ``` 42 | 43 | ### Am I working from the latest `main`? 44 | Now that your `main` branch is up-to-date with both `origin` and `upstream`, you should use it to update your local working branches. If you are already developing in a branch called, e.g. `my_branch`, do this before writing any more code: 45 | 46 | ```bash 47 | git checkout my_branch 48 | git merge main 49 | git push origin my_branch 50 | ``` 51 | 52 | ### Do I have any stale branches? 53 | With your local `main`, `origin/main` and `upstream/main` all in sync, we like to clean up any old branches that are fully merged (and hence, can be deleted without data loss.) 
54 | ```bash 55 | git branch --merged main 56 | git branch -d 57 | ``` 58 | A really great feature of `git branch -d` is that it will refuse to remove a branch that hasn't been fully merged into another. Thus it's safe to use without any fear of data loss. 59 | 60 | 61 | ### Time to start the day 62 | Once you've finished all your merge tasks, you can create a clean working branch from the latest `main` by doing a: 63 | ```bash 64 | git checkout main 65 | git checkout -b new_branch_name 66 | ``` 67 | 68 | 69 | That's it!. Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue at 70 | [cookiecutter-easydata]. 71 | 72 | [cookiecutter-easydata]: https://github.com/hackalog/cookiecutter-easydata/ -------------------------------------------------------------------------------- /src/data/extra.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for handling "extra" data; i.e. collections of raw files associated with a Dataset 3 | """ 4 | 5 | from collections import defaultdict 6 | import pathlib 7 | import shutil 8 | import os 9 | 10 | from tqdm.auto import tqdm 11 | 12 | from .. import paths 13 | from ..log import logger 14 | 15 | __all__ = [ 16 | 'process_extra_files', 17 | ] 18 | 19 | def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", extra_dir=".extra", dataset_dir=None, do_copy=False): 20 | """ 21 | Process unpacked raw files into its minimal dataset components (data, target, metadata). 22 | Here, 'minimal' means `data` and `target` will be None, and `extra` will contain a 23 | file dict of files matching the specified file_glob (and their sizes). 24 | 25 | Parameters 26 | ---------- 27 | unpack_dir: default paths['interim_data_path'] 28 | The directory the interim data files have been unpacked into 29 | dataset_dir: default paths['processed_data_path'] 30 | location of processed datasets. 31 | extract_dir: 32 | Name of the directory of the unpacked zip file containing the raw data files. 33 | relative to unpack_dir 34 | file_glob: string 35 | Add only files matching this glob pattern to EXTRA 36 | extra_dir: string 37 | Used in building the file_dict keys. 38 | do_copy: boolean 39 | if True, actually copy the files. Otherwise just build EXTRA 40 | 41 | Returns 42 | ------- 43 | (data, target, additional_metadata) 44 | 45 | where 46 | 47 | data and target are None, 48 | 49 | metadata contains a file dict; i.e. 
50 | 'extra': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} 51 | """ 52 | if metadata is None: 53 | metadata = {} 54 | 55 | if dataset_dir is None: 56 | dataset_dir = paths['processed_data_path'] 57 | else: 58 | dataset_dir = pathlib.Path(dataset_dir) 59 | if unpack_dir is None: 60 | unpack_dir = paths['interim_data_path'] 61 | else: 62 | unpack_dir = pathlib.Path(unpack_dir) 63 | if extract_dir is not None: 64 | unpack_dir /= extract_dir 65 | 66 | extra_dir = pathlib.Path(extra_dir) 67 | extra_dir_fq = dataset_dir / extra_dir 68 | logger.debug(f"Do copy: {do_copy}") 69 | if do_copy: 70 | if extra_dir_fq.is_dir(): 71 | logger.warning(f"Cleaning contents of {extra_dir}") 72 | shutil.rmtree(extra_dir_fq) 73 | logger.debug(f"Copying files to {extra_dir_fq}...") 74 | 75 | file_dict = defaultdict(dict) 76 | files = sorted(list(unpack_dir.rglob(file_glob))) 77 | for i, file in enumerate(tqdm(files)): 78 | if file.is_dir(): 79 | continue 80 | relative_path = file.relative_to(unpack_dir) 81 | extra_path = extra_dir / relative_path 82 | file_dict[str(extra_path.parent)][str(extra_path.name)] = [f'size:{os.path.getsize(file)}'] 83 | if do_copy: 84 | os.makedirs(dataset_dir / extra_path.parent, exist_ok=True) 85 | shutil.copyfile(file, dataset_dir / extra_path) 86 | metadata['extra'] = dict(file_dict) 87 | 88 | return None, None, metadata 89 | -------------------------------------------------------------------------------- /reference/easydata/git-configuration.md: -------------------------------------------------------------------------------- 1 | # Setting up git and Checking Out the Repo 2 | 3 | **Note**: These instructions assume you are using SSH keys (and not HTTPS authentication) with github.com. If you haven't set up SSH access to your repo host, see [Configuring SSH Access to Github or Gitlab][git-ssh]. This also includes instructions for using more than one account with SSH keys. 4 | 5 | [git-ssh]: https://github.com/hackalog/cookiecutter-easydata/wiki/Configuring-SSH-Access-to-Github-or-GitLab 6 | 7 | ## Git Configuration 8 | When sharing a git repo with a small team, your code usually lives in at least 3 different places: 9 | 10 | * "local" refers to any git checkout on a local machine (or JupyterHub instance). This is where you work most of the time. 11 | * `upstream` refers to the shared Easydata repo on github.com; i.e. the **team repo**, 12 | * `origin` refers to your **personal fork** of the shared Easydata repo. It also lives on github.com. 13 | 14 | ### Create a Personal Fork 15 | 16 | We strongly recommend you make all your edits on a personal fork of this repo. Here's how to create such a fork: 17 | 18 | * On Github or Gitlab, press the Fork button in the top right corner. 19 | * On Bitbucket, press the "+" icon on the left and choose **Fork this Repo** 20 | 21 | ### Local, `origin`, and `upstream` 22 | git calls `upstream` (the **team repo**), and `origin` (your **personal fork** of the team repo) "remote" branches. Here's how to create them. 
23 | 24 | Create a local git checkout by cloning your personal fork: 25 | ```bash 26 | git clone git@github.com:/make_better_defaults.git 27 | ``` 28 | Add the team (shared) repo as a remote branch named `upstream`: 29 | ```bash 30 | cd make_better_defaults 31 | git remote add upstream git@github.com:/make_better_defaults.git 32 | ``` 33 | 34 | You can verify that these branches are configured correctly by typing 35 | 36 | ``` 37 | >>> git remote -v 38 | origin git@github.com:/make_better_defaults.git (fetch) 39 | origin git@github.com:/make_better_defaults.git (push) 40 | upstream git@github.com:/make_better_defaults.git (fetch) 41 | upstream git@github.com:/make_better_defaults.git (push) 42 | ``` 43 | or if you use HTTPS-based authentication: 44 | ``` 45 | origin https://github.com//make_better_defaults.git (fetch) 46 | origin https://github.com//make_better_defaults.git (push) 47 | upstream https://github.com//make_better_defaults.git (fetch) 48 | upstream https://github.com//make_better_defaults.git (push) 49 | ``` 50 | 51 | ### Do Your Work in Branches 52 | To make life easiest, we recommend you do all your development **in branches**, and use your main branch **only** for tracking changes in the shared `upstream/main`. This combination makes it much easier not only to stay up to date with changes in the shared project repo, but also makes it easier to submit Pull/Merge Requests (PRs) against the upstream project repository should you want to share your code or data. 53 | 54 | ### A Useful Git Workflow 55 | Once you've got your local, `origin`, and `upstream` branches configured, you can follow the instructions in this handy [Git Workflow Cheat Sheet](git-workflow.md) to keep your working copy of the repo in sync with the others. 56 | -------------------------------------------------------------------------------- /src/tests/make_test_datasets.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import fetch_20newsgroups 2 | from functools import partial 3 | 4 | from src.data import DataSource, Dataset, DatasetGraph, Catalog 5 | from src import workflow, paths 6 | from src.log import logger 7 | 8 | # Set up a 20 newsgroups dataset 9 | 10 | ds_name = '20_newsgroups' 11 | output_ds_name = ds_name 12 | dsrc = DataSource(ds_name) 13 | 14 | license = """ 15 | Custom Academic License: "You may use this material free of charge for any educational purpose, provided attribution is given in any lectures or publications that make use of this material." As in http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.data.html. 16 | """ 17 | metadata = """ 18 | The 20 Newsgroups dataset is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. 19 | 20 | The data is organized into 20 different newsgroups, each corresponding to a different topic. Some of the newsgroups are very closely related to each other (e.g. comp.sys.ibm.pc.hardware / comp.sys.mac.hardware), while others are highly unrelated (e.g misc.forsale / soc.religion.christian). 
21 | 22 | Here are the categories: 23 | 24 | * `alt.atheism`, 25 | * `comp.graphics`, 26 | * `comp.os.ms-windows.misc`, 27 | * `comp.sys.ibm.pc.hardware`, 28 | * `comp.sys.mac.hardware`, 29 | * `comp.windows.x`, 30 | * `misc.forsale`, 31 | * `rec.autos`, 32 | * `rec.motorcycles`, 33 | * `rec.sport.baseball`, 34 | * `rec.sport.hockey`, 35 | * `sci.crypt`, 36 | * `sci.electronics`, 37 | * `sci.med`, 38 | * `sci.space`, 39 | * `soc.religion.christian`, 40 | * `talk.politics.guns`, 41 | * `talk.politics.mideast`, 42 | * `talk.politics.misc`, 43 | * `talk.religion.misc` 44 | 45 | The current version is obtained by wrapping `sklearn.datasets.fetch_20newsgroups`, which comes from this [20 newsgroups webpage](http://qwone.com/~jason/20Newsgroups/). 46 | 47 | By default we follow the sklearn suggestion to set `remove=('headers', 'footers', 'quotes')` to avoid overfitting. 48 | """ 49 | 50 | dsrc.add_metadata(contents=metadata, force=True) 51 | dsrc.add_metadata(contents=license, kind='LICENSE', force=True) 52 | 53 | def process_20_newsgroups(*, extract_dir='20_newsgroups', 54 | metadata=None, unpack_dir=None, 55 | opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): 56 | """ 57 | Process 20 newsgroups into (data, target, metadata) format. 58 | 59 | 60 | Parameters 61 | ---------- 62 | unpack_dir: path 63 | The interim parent directory the dataset files have been unpacked into. 64 | extract_dir: str 65 | Name of the directory of the unpacked files relative to the unpack_dir. Note that 66 | opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} 67 | Options to pass to sklearn.datasets.fetch_20newsgroups. 68 | 69 | 70 | Returns 71 | ------- 72 | A tuple: 73 | (data, target, additional_metadata) 74 | 75 | """ 76 | if metadata is None: 77 | metadata = {} 78 | 79 | if unpack_dir is None: 80 | unpack_dir = paths['interim_data_path'] 81 | else: 82 | unpack_dir = pathlib.Path(unpack_dir) 83 | data_dir = unpack_dir / f"{extract_dir}" 84 | 85 | news = fetch_20newsgroups(**opts) 86 | 87 | return news.data, news.target, metadata 88 | 89 | process_function = process_20_newsgroups 90 | process_kwargs = {} 91 | 92 | dsrc.process_function = partial(process_function, **process_kwargs) 93 | dsrc.update_catalog() 94 | 95 | dag = DatasetGraph() 96 | dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) 97 | -------------------------------------------------------------------------------- /Makefile.help: -------------------------------------------------------------------------------- 1 | ################################################################################# 2 | # Self Documenting Help for Make Targets # 3 | ################################################################################# 4 | # 5 | # The MIT License (MIT) 6 | # Copyright (c) 2016 DrivenData, Inc. 7 | # Copyright (c) 2018 Kjell Wooding 8 | # 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the "Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | # 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 
19 | # 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 24 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 25 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | # SOFTWARE. 28 | 29 | .DEFAULT_GOAL := show-help 30 | 31 | # Inspired by 32 | # sed script explained: 33 | # /^##/: 34 | # * save line in hold space 35 | # * purge line 36 | # * Loop: 37 | # * append newline + line to hold space 38 | # * go to next line 39 | # * if line starts with doc comment, strip comment character off and loop 40 | # * remove target prerequisites 41 | # * append hold space (+ newline) to line 42 | # * replace newline plus comments by `---` 43 | # * print line 44 | # Separate expressions are necessary because labels cannot be delimited by 45 | # semicolon; see 46 | .PHONY: show-help 47 | 48 | print-% : ; @echo $* = $($*) 49 | 50 | HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH 51 | 52 | help-prefix: 53 | @echo "To get started:" 54 | @echo " >>> $$(tput bold)make create_environment$$(tput sgr0)" 55 | @echo " >>> $$(tput bold)conda activate $(PROJECT_NAME)$$(tput sgr0)" 56 | @echo " >>> $$(tput bold)make update_environment$$(tput sgr0)" 57 | @echo "" 58 | @echo "$$(tput bold)Project Variables:$$(tput sgr0)" 59 | @echo "" 60 | 61 | show-help: help-prefix $(addprefix print-, $(HELP_VARS)) 62 | @echo 63 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 64 | @sed -n -e "/^## / { \ 65 | h; \ 66 | s/.*//; \ 67 | :doc" \ 68 | -e "H; \ 69 | n; \ 70 | s/^## //; \ 71 | t doc" \ 72 | -e "s/:.*//; \ 73 | G; \ 74 | s/\\n## /---/; \ 75 | s/\\n/ /g; \ 76 | p; \ 77 | }" ${MAKEFILE_LIST} \ 78 | | LC_ALL='C' sort --ignore-case \ 79 | | awk -F '---' \ 80 | -v ncol=$$(tput cols) \ 81 | -v indent=19 \ 82 | -v col_on="$$(tput setaf 6)" \ 83 | -v col_off="$$(tput sgr0)" \ 84 | '{ \ 85 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 86 | n = split($$2, words, " "); \ 87 | line_length = ncol - indent; \ 88 | for (i = 1; i <= n; i++) { \ 89 | line_length -= length(words[i]) + 1; \ 90 | if (line_length <= 0) { \ 91 | line_length = ncol - indent - length(words[i]) - 1; \ 92 | printf "\n%*s ", -indent, " "; \ 93 | } \ 94 | printf "%s ", words[i]; \ 95 | } \ 96 | printf "\n"; \ 97 | }' \ 98 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') 99 | @echo -------------------------------------------------------------------------------- /src/_paths.py: -------------------------------------------------------------------------------- 1 | from .decorators import SingletonDecorator 2 | from .kvstore import KVStore 3 | from .log import logger 4 | import pathlib 5 | 6 | class PathStore(KVStore): 7 | """Persistent Key-Value store for project-level paths 8 | 9 | >>> b = PathStore(config_file='/tmpx/project/catalog/config.ini', \ 10 | project_path='${catalog_path}/..', \ 11 | data_path='${project_path}/data', \ 12 | persistent=False) 13 | 14 | By default, the project directory is the parent of the directory containing the `config_file`: 15 | 16 | >>> b['project_path'] 17 | PosixPath('/tmpx/project') 18 | >>> b['data_path'] 19 | PosixPath('/tmpx/project/data') 20 | 21 | The `catalog_path` is set upon instantiation and is read-only: 22 | 23 | >>> b['catalog_path'] 24 | 
PosixPath('/tmpx/project/catalog') 25 | >>> b['catalog_path'] = '/tmp' 26 | Traceback (most recent call last): 27 | ... 28 | AttributeError: catalog_path is write-protected 29 | 30 | Changing a value changes all values that expand to contain it: 31 | 32 | >>> b['project_path'] = '/tmpy' 33 | >>> b['project_path'] 34 | PosixPath('/tmpy') 35 | >>> b['data_path'] 36 | PosixPath('/tmpy/data') 37 | 38 | We can have multiple levels of expansion: 39 | 40 | >>> b['raw_data_path'] = "${data_path}/raw" 41 | >>> b['raw_data_path'] 42 | PosixPath('/tmpy/data/raw') 43 | >>> b['project_path'] = '/tmp3' 44 | >>> b['data_path'] 45 | PosixPath('/tmp3/data') 46 | >>> b['raw_data_path'] 47 | PosixPath('/tmp3/data/raw') 48 | """ 49 | 50 | # These keys should never be written to disk, though they may be used 51 | # as variables in relative paths 52 | _protected = ['catalog_path'] 53 | 54 | def __init__(self, *args, 55 | config_section='Paths', config_file=None, 56 | **kwargs): 57 | """Handle the special case of the config file""" 58 | if config_file is None: 59 | self._config_file = "config.ini" 60 | else: 61 | self._config_file = pathlib.Path(config_file) 62 | self._usage_warning = False 63 | super().__init__(*args, config_section=config_section, 64 | config_file=self._config_file, **kwargs) 65 | self._usage_warning = True 66 | 67 | def _write(self): 68 | """temporarily hide protected keys when saving""" 69 | for key in self._protected: 70 | self._config.remove_option(self._config_section, key) 71 | super()._write() 72 | for key in self._protected: 73 | self._config.set(self._config_section, key, str(getattr(self, key))) 74 | 75 | def __setitem__(self, key, value): 76 | """Do not set a key if it is protected""" 77 | if key in self._protected: 78 | raise AttributeError(f"{key} is write-protected") 79 | 80 | if self._usage_warning: 81 | logger.warning(f"'{key}' is a local configuration variable, and for reproducibility reasons, should not set from a notebook or shared code. It is better to edit '{self._config_file}' instead. We have set it, but you have been warned.") 82 | 83 | super().__setitem__(key, value) 84 | 85 | 86 | def __getitem__(self, key): 87 | """get keys (including protected ones), converting to paths and fully resolving them""" 88 | if key in self._protected: 89 | return getattr(self, key) 90 | self._read() 91 | return pathlib.Path(super().__getitem__(key)).resolve() 92 | 93 | @property 94 | def catalog_path(self): 95 | return self._config_file.parent.resolve() 96 | 97 | @SingletonDecorator 98 | class Paths(PathStore): 99 | pass 100 | 101 | 102 | if __name__ == "__main__": 103 | import doctest 104 | doctest.testmod() 105 | -------------------------------------------------------------------------------- /reference/easydata/troubleshooting.md: -------------------------------------------------------------------------------- 1 | ## Troubleshooting Guide 2 | 3 | It's impossible to test the configurations on every possible machine, so we haven't caught everything. But we're working on making fixes as problems come up. Here's what we've encountered so far (with links to the issues in question if you want to deep dive into the fix). 4 | 5 | Before you report a problem, make sure you are running the latest version of the surge repo. 6 | Assuming you are following the [recommended git workflow](git-workflow.md) (i.e. 
you have set your `upstream` remote to point to the surge repo, you are working in a branch, and your `main` branch is tracking the surge repo), this means doing a: 7 | ``` 8 | git checkout main 9 | git fetch upstream --prune 10 | git merge upstream/main 11 | git push origin main 12 | make update_environment 13 | ``` 14 | 15 | You can then update your working branches as follows: 16 | ``` 17 | git checkout my_branch 18 | git merge main # advanced git users can do a rebase here. Others please merge. 19 | ``` 20 | 21 | Next, turn on debugging in your notebook. Add these cells to the top: 22 | ``` 23 | import logging 24 | from src.log import logger 25 | 26 | logger.setLevel(logging.DEBUG) 27 | ``` 28 | 29 | Third, ensure your notebook is running the correct environment; i.e. select **Kernel -> Change kernel -> Python[conda env:make_better_defaults]**. If you don't seem to have that option, make sure that you ran `jupyter notebooks` with the `make_better_defaults` conda environment enabled, and that `which jupyter` points to the correct (make_better_defaults) version of jupyter. 30 | 31 | 32 | If your problem persists, work through the table below. If these fail to resolve your issue, please post your issue. Include with your issue: 33 | 34 | * A copy/pasted copy of the error traceback text (preferably posted as a "code snippet"), including DEBUG-level log messages. 35 | * The contents of your `environment.*.lock.yml` 36 | * the output of `%conda info` (run from within your jupyter notebook) 37 | * The output of `which python` and `which jupyter` 38 | 39 | | Problem | Status | Fix | 40 | | :--- | :---- | :---- | 41 | | General weirdness due to not being in the right conda environment | **Try this first** | `conda activate make_better_defaults` or change the kernel in your jupyter notebook | 42 | | Old conda (e.g. `src` module is not being installed correctly) | **Try this second**| Upgrade conda to version > 4.8 | 43 | | `src` module not found | **Try this first** | `conda activate make_better_defaults`| 44 | | `src` module still doesn't work | **Try this second** | `touch environment.yml && make update_environment` | 45 | | Nothing works | Take off and nuke it from orbit | `conda deactivate && make delete_environment && make create_environment`| 46 | 47 | ### Other specific troubleshooting FAQ 48 | 49 | If `import cairo` fails, this may suggest some library (such as `libXrender.so`) could be missing. If you’ve followed all the troubleshooting instructions above, then proceed. 50 | 51 | There is an open issue with Conda's handling of system dependencies related to the Cairo library, which is used for graph visualization through the `igraph` library, amongst other things. Seemingly, on cloud-borne virtual machines, such libraries that are common on desktop installs go undeployed, a fact that Conda apparently neglects. 52 | 53 | Once can work around this issue by locally installing the missing dependency through their system's package manager (e.g. APT, Yum, Homebrew, and so on). 
For instance, on Ubuntu 18.04, the aforementioned Xrender library can be installed with the command 54 | 55 | ``` 56 | sudo apt-get install -y libxrender-dev 57 | ``` 58 | 59 | 60 | ### Quick References 61 | 62 | * [README](../README.md) 63 | * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) 64 | * [Getting and Using Datasets](datasets.md) 65 | * [Using Notebooks for Analysis](notebooks.md) 66 | * [Sharing your Work](sharing-your-work.md) 67 | -------------------------------------------------------------------------------- /Makefile.envs: -------------------------------------------------------------------------------- 1 | # 2 | # Environment Management Makefile 3 | # 4 | 5 | include Makefile.include 6 | 7 | $(EASYDATA_LOCKFILE): environment.yml 8 | ifeq (conda, $(VIRTUALENV)) 9 | $(CONDA_EXE) env update -n $(PROJECT_NAME) -f $< 10 | $(CONDA_EXE) env export -n $(PROJECT_NAME) -f $@ 11 | # pip install -e . # uncomment for conda <= 4.3 12 | else 13 | $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) 14 | endif 15 | 16 | .PHONY: create_environment 17 | ## Set up virtual (conda) environment for this project 18 | create_environment: $(EASYDATA_LOCKFILE) 19 | ifeq (conda,$(VIRTUALENV)) 20 | @rm -f $(EASYDATA_LOCKFILE) 21 | @echo 22 | @echo "New conda env created. Activate with:" 23 | @echo ">>> conda activate $(PROJECT_NAME)" 24 | @echo ">>> make update_environment" 25 | ifneq ("X$(wildcard .post-create-environment.txt)","X") 26 | @cat .post-create-environment.txt 27 | endif 28 | else 29 | $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) 30 | endif 31 | 32 | .PHONY: delete_environment 33 | ## Delete the virtual (conda) environment for this project 34 | delete_environment: 35 | ifeq (conda,$(VIRTUALENV)) 36 | @echo "Deleting conda environment." 37 | $(CONDA_EXE) env remove -n $(PROJECT_NAME) 38 | rm -f $(EASYDATA_LOCKFILE) 39 | ifneq ("X$(wildcard .post-delete-environment.txt)","X") 40 | @cat .post-delete-environment.txt 41 | endif 42 | else 43 | $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) 44 | endif 45 | 46 | .PHONY: update_environment 47 | ## Install or update Python Dependencies in the virtual (conda) environment 48 | update_environment: environment_enabled $(EASYDATA_LOCKFILE) 49 | ifneq ("X$(wildcard .post-update-environment.txt)","X") 50 | @cat .post-update-environment.txt 51 | endif 52 | 53 | .PHONY: environment_enabled 54 | # Checks that the conda environment is active 55 | environment_enabled: 56 | ifeq (conda,$(VIRTUALENV)) 57 | ifneq ($(notdir ${CONDA_DEFAULT_ENV}), $(PROJECT_NAME)) 58 | $(error Run "$(VIRTUALENV) activate $(PROJECT_NAME)" before proceeding...) 59 | endif 60 | else 61 | $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) 62 | endif 63 | 64 | .PHONY: check_lockfile 65 | # Test that an environment lockfile exists 66 | check_lockfile: 67 | ifeq (X,X$(wildcard $(EASYDATA_LOCKFILE))) 68 | $(error Run "make update_environment" before proceeding...) 
69 | endif 70 | 71 | .PHONY: check_environment 72 | ## Check if environment is enabled and correctly configured 73 | check_environment: environment_enabled check_lockfile $(EASYDATA_LOCKFILE) 74 | 75 | .phony: help_update_easydata 76 | help_update_easydata: 77 | @echo "\nTo update easydata on an existing repo, verify that you have an 'easydata' branch" 78 | @echo "\n>>>git rev-parse -q --verify easydata" 79 | @echo "\nIf no output is given, do this:" 80 | @echo "\n>>>git branch easydata `git rev-list --max-parents=0 HEAD`" 81 | @echo "\nIf no output is given, do this:" 82 | @echo "\nCheck-in all your changes, then merge the new easydata branch into yours" 83 | @echo "\ngit branch easydata" 84 | @echo "# replace easydata with https://github.com/hackalog/easydata if needed" 85 | @echo "pushd .. && cookiecutter --config-file $(PROJECT_NAME)/.easydata.yml easydata -f --no-input && popd" 86 | @echo "git add -p # add all the changes" 87 | @echo "git commit -m 'sync with easydata'" 88 | @echo "git checkout main" 89 | @echo "git merge easydata" 90 | 91 | .PHONY: debug_environment 92 | ## dump useful debugging information to $(DEBUG_FILE) 93 | debug_environment: 94 | @echo "\n\n======================" 95 | @echo "\nPlease include the contents $(DEBUG_FILE) when submitting an issue or support request.\n" 96 | @echo "======================\n\n" 97 | @echo "##\n## Git status\n##\n" > $(DEBUG_FILE) 98 | git status >> $(DEBUG_FILE) 99 | @echo "\n##\n## git log\n##\n" >> $(DEBUG_FILE) 100 | git log -8 --graph --oneline --decorate --all >> $(DEBUG_FILE) 101 | @echo "\n##\n## Github remotes\n##\n" >> $(DEBUG_FILE) 102 | git remote -v >> $(DEBUG_FILE) 103 | @echo "\n##\n## github SSH credentials\n##\n" >> $(DEBUG_FILE) 104 | ssh git@github.com 2>&1 | cat >> $(DEBUG_FILE) 105 | @echo "\n##\n## Conda config\n##\n" >> $(DEBUG_FILE) 106 | $(CONDA_EXE) config --get >> $(DEBUG_FILE) 107 | @echo "\n##\n## Conda info\n##\n" >> $(DEBUG_FILE) 108 | $(CONDA_EXE) info >> $(DEBUG_FILE) 109 | @echo "\n##\n## Conda list\n##\n" >> $(DEBUG_FILE) 110 | $(CONDA_EXE) list >> $(DEBUG_FILE) 111 | 112 | .PHONY: unfinished 113 | unfinished: 114 | @echo "WARNING: this target is unfinished and may be removed or changed dramatically in future releases" 115 | -------------------------------------------------------------------------------- /reference/easydata/paths.md: -------------------------------------------------------------------------------- 1 | ## Specifying paths in Easydata 2 | 3 | As hardcoded paths are a notorious source of reproducibility issues, Easydata attempts to help avoid path-related issues by introducing a mechanism called `paths`. 4 | 5 | ``` 6 | >>> from src import paths 7 | ``` 8 | 9 | The goal of the `paths` mechanism is to help ensure that **hardcoded path data is never checked-in** to the git repository. 10 | 11 | In an Easydata project, paths are recorded in `catalog/config.ini`. This is a standard `configparser`-format _ini_ file (in [ExtendedInterpolation] format). The paths specified in this file are used throughout Easydata to specify the standard locations of data artifacts. 12 | 13 | [ExtendedInterpolation]: https://docs.python.org/3/library/configparser.html#configparser.ExtendedInterpolation 14 | 15 | Because [ExtendedInterpolation] format is used, paths may refer to each other without the need to specify absolute path names. 
The default paths, for example, are all relative to `project_path`: 16 | 17 | ``` 18 | [Paths] 19 | data_path = ${project_path}/data 20 | raw_data_path = ${data_path}/raw 21 | interim_data_path = ${data_path}/interim 22 | processed_data_path = ${data_path}/processed 23 | project_path = ${catalog_path}/.. 24 | ``` 25 | 26 | Note that, for chicken-and-egg reasons, `catalog_path` (the location of the `config.ini` file used to specify the paths) is **not specified** in this file. It is set upon module instantiation (when `src` is imported) and is write-protected: 27 | 28 | ``` 29 | >>> paths['catalog_path'] 30 | PosixPath('/tmpx/project/catalog') 31 | >>> paths['catalog_path'] = '/tmp' 32 | Traceback (most recent call last): 33 | ... 34 | AttributeError: catalog_path is write-protected 35 | ``` 36 | 37 | ### Accessing `paths` from Python 38 | 39 | Within Python, `paths` appears to be a dictionary of standard path locations. For instance, if your Easydata project lives in the `/path/to/repo` directory: 40 | 41 | ```python 42 | >>> paths['project_path'] 43 | /path/to/repo 44 | >>> type(paths['project_path']) 45 | pathlib.PosixPath 46 | ``` 47 | 48 | Notice that paths are automatically resolved to absolute filenames (in [pathlib] format) when accessed. 49 | 50 | ```python 51 | >>> for name, location in paths.items(): 52 | >>> print(f"{name}: {location}") 53 | data_path: /path/to/repo/make_better_defaults/data 54 | raw_data_path: /path/to/repo/make_better_defaults/data/raw 55 | interim_data_path: /path/to/repo/make_better_defaults/data/interim 56 | processed_data_path: /path/to/repo/make_better_defaults/data/processed 57 | project_path: /path/to/repo/make_better_defaults 58 | ``` 59 | [pathlib]: https://docs.python.org/3/library/pathlib.html 60 | 61 | Even though absolute paths are returned from the dictionary, the relative nature of the paths is preserved when these paths are modified. 62 | 63 | 64 | ### Modifying paths 65 | 66 | Recall that one of the Easydata design goals is to ensure that hardcoded paths are never checked into your git repository. To this end, paths should **never be set from within notebooks or source code that is checked-in** to git.
If you wish to modify a path on your local system, edit `config.ini` directly, or use python from the command line, as shown below: 67 | 68 | ```bash 69 | >>> python -c "import src; src.paths['project_path'] = '/alternate/bigdata/path'" 70 | ``` 71 | 72 | When accessed from Python, you'll immediately see that the paths have all changed: 73 | 74 | ```python 75 | >>> for name, location in paths.items(): 76 | >>> print(f"{name}: {location}") 77 | data_path: /alternate/bigdata/path/make_better_defaults/data 78 | raw_data_path: /alternate/bigdata/path/make_better_defaults/data/raw 79 | interim_data_path: /alternate/bigdata/path/make_better_defaults/data/interim 80 | processed_data_path: /alternate/bigdata/path/make_better_defaults/data/processed 81 | project_path: /alternate/bigdata/path/make_better_defaults 82 | ``` 83 | as has `config.ini`: 84 | 85 | ```bash 86 | >>> cat catalog/config.ini 87 | [Paths] 88 | data_path = ${project_path}/data 89 | raw_data_path = ${data_path}/raw 90 | interim_data_path = ${data_path}/interim 91 | processed_data_path = ${data_path}/processed 92 | project_path:/alternate/bigdata/path 93 | ``` 94 | 95 | ### Accessing the unresolved paths from Python 96 | 97 | If you ever need to see the raw (non-resolved) versions of the paths from within Python, use `paths.data`: 98 | 99 | ```python 100 | >>> for name, location in paths.data.items(): 101 | >>> print(f"{name}: {location}") 102 | data_path:${project_path}/data 103 | raw_data_path:${data_path}/raw 104 | interim_data_path:${data_path}/interim 105 | processed_data_path:${data_path}/processed 106 | project_path:/alternate/bigdata/path 107 | ``` 108 | 109 | ### For more information 110 | ```python 111 | >>> from src import paths 112 | >>> help(paths) 113 | ``` 114 | -------------------------------------------------------------------------------- /src/utils/ipynbname.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib.error 3 | import urllib.request 4 | from itertools import chain 5 | from pathlib import Path, PurePath 6 | from typing import Generator, Tuple, Union 7 | 8 | import ipykernel 9 | from jupyter_core.paths import jupyter_runtime_dir 10 | from traitlets.config import MultipleInstanceError 11 | 12 | __license__ = """ 13 | Copyright (c) 2020 Mark McPherson 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy 16 | of this software and associated documentation files (the "Software"), to deal 17 | in the Software without restriction, including without limitation the rights 18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | copies of the Software, and to permit persons to whom the Software is 20 | furnished to do so, subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included in all 23 | copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | SOFTWARE. 32 | """ 33 | 34 | 35 | 36 | FILE_ERROR = "Can't identify the notebook {}."
37 | CONN_ERROR = "Unable to access server;\n" \ 38 | + "ipynbname requires either no security or token based security." 39 | 40 | 41 | def _list_maybe_running_servers(runtime_dir=None) -> Generator[dict, None, None]: 42 | """ Iterate over the server info files of running notebook servers. 43 | """ 44 | if runtime_dir is None: 45 | runtime_dir = jupyter_runtime_dir() 46 | runtime_dir = Path(runtime_dir) 47 | 48 | if runtime_dir.is_dir(): 49 | for file_name in chain( 50 | runtime_dir.glob('nbserver-*.json'), # jupyter notebook (or lab 2) 51 | runtime_dir.glob('jpserver-*.json'), # jupyterlab 3 52 | ): 53 | yield json.loads(file_name.read_bytes()) 54 | 55 | 56 | def _get_kernel_id() -> str: 57 | """ Returns the kernel ID of the ipykernel. 58 | """ 59 | connection_file = Path(ipykernel.get_connection_file()).stem 60 | kernel_id = connection_file.split('-', 1)[1] 61 | return kernel_id 62 | 63 | 64 | def _get_sessions(srv): 65 | """ Given a server, returns sessions, or HTTPError if access is denied. 66 | NOTE: Works only when either there is no security or there is token 67 | based security. An HTTPError is raised if unable to connect to a 68 | server. 69 | """ 70 | try: 71 | qry_str = "" 72 | token = srv['token'] 73 | if token: 74 | qry_str = f"?token={token}" 75 | url = f"{srv['url']}api/sessions{qry_str}" 76 | with urllib.request.urlopen(url) as req: 77 | return json.load(req) 78 | except Exception: 79 | raise urllib.error.HTTPError(CONN_ERROR) 80 | 81 | 82 | def _find_nb_path() -> Union[Tuple[dict, PurePath], Tuple[None, None]]: 83 | try: 84 | kernel_id = _get_kernel_id() 85 | except (MultipleInstanceError, RuntimeError, IndexError): 86 | return None, None # Could not determine 87 | for srv in _list_maybe_running_servers(): 88 | try: 89 | sessions = _get_sessions(srv) 90 | for sess in sessions: 91 | if sess['kernel']['id'] == kernel_id: 92 | return srv, PurePath(sess['notebook']['path']) 93 | except Exception: 94 | pass # There may be stale entries in the runtime directory 95 | return None, None 96 | 97 | def filepath(): 98 | """Return notebook filename and path as a tuple""" 99 | _, path = _find_nb_path() 100 | if path: 101 | return path.name, path.parent 102 | raise FileNotFoundError(FILE_ERROR.format('name')) 103 | 104 | 105 | def name() -> str: 106 | """ Returns the short name of the notebook w/o the .ipynb extension, 107 | or raises a FileNotFoundError exception if it cannot be determined. 108 | """ 109 | _, path = _find_nb_path() 110 | if path: 111 | return path.stem 112 | raise FileNotFoundError(FILE_ERROR.format('name')) 113 | 114 | def path() -> Path: 115 | """ Returns the absolute path of the notebook, 116 | or raises a FileNotFoundError exception if it cannot be determined. 117 | """ 118 | srv, path = _find_nb_path() 119 | if srv and path: 120 | root_dir = Path(srv.get('root_dir') or srv['notebook_dir']) 121 | return root_dir / path 122 | raise FileNotFoundError(FILE_ERROR.format('path')) 123 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pathlib 4 | import time 5 | 6 | import nbformat 7 | from nbconvert.preprocessors import ExecutePreprocessor, CellExecutionError 8 | 9 | 10 | from ..log import logger 11 | from .ipynbname import name as ipynb_name, path as ipynb_path 12 | from .. 
import paths 13 | 14 | # Timing and Performance 15 | 16 | def timing_info(method): 17 | def wrapper(*args, **kw): 18 | start_time = time.time() 19 | result = method(*args, **kw) 20 | end_time = time.time() 21 | logger.info(f"timing_info: {method.__name__}" 22 | f"@{round((end_time-start_time)*1000,1)} ms") 23 | 24 | return result 25 | 26 | return wrapper 27 | 28 | def record_time_interval(section, start_time, line_break=False): 29 | """Record a time interval since the last timestamp""" 30 | end_time = time.time() 31 | delta = end_time - start_time 32 | if delta < 1: 33 | delta *= 1000 34 | units = "ms" 35 | else: 36 | units = "s" 37 | if line_break: 38 | logger.debug("PROCESS_TIME:{:>36} {} {}\n".format(section, round(delta, 1), units)) 39 | else: 40 | logger.debug("PROCESS_TIME:{:>36} {} {}".format(section, round(delta, 1), units)) 41 | return end_time 42 | 43 | def normalize_numpy_dict(d): 44 | ret = d.copy() 45 | for k, v in ret.items(): 46 | if isinstance(v, np.generic): 47 | ret[k] = np.asscalar(v) 48 | return ret 49 | 50 | def save_json(filename, obj, indent=2, sort_keys=True): 51 | """Dump an object to disk in json format 52 | 53 | filename: pathname 54 | Filename to dump to 55 | obj: object 56 | Object to dump 57 | indent: integer 58 | number of characters to indent 59 | sort_keys: boolean 60 | Whether to sort keys before writing. Should be True if you ever use revision control 61 | on the resulting json file. 62 | """ 63 | blob = json.dumps(obj, indent=indent, sort_keys=sort_keys) 64 | 65 | with open(filename, 'w') as fw: 66 | fw.write(blob) 67 | 68 | def load_json(filename): 69 | """Read a json file from disk""" 70 | with open(filename) as f: 71 | obj = json.load(f) 72 | return obj 73 | 74 | def head_file(filename, n=5): 75 | """Return the first `n` lines of a file 76 | """ 77 | with open(filename, 'r') as fd: 78 | lines = [] 79 | for i, line in enumerate(fd): 80 | if i > n: 81 | break 82 | lines.append(line) 83 | return "".join(lines) 84 | 85 | def list_dir(path, fully_qualified=False, glob_pattern='*'): 86 | """do an ls on a path 87 | 88 | fully_qualified: boolean (default: False) 89 | If True, return a list of fully qualified pathlib objects. 90 | if False, return just the bare filenames 91 | glob_pattern: glob (default: '*') 92 | File mattern to match 93 | 94 | Returns 95 | ------- 96 | A list of names, or fully qualified pathlib objects""" 97 | if fully_qualified: 98 | return list(pathlib.Path(path).glob(glob_pattern)) 99 | 100 | return [file.name for file in pathlib.Path(path).glob(glob_pattern)] 101 | 102 | def normalize_to_list(str_or_iterable): 103 | """Convert strings to lists. convert None to list. 
Convert all other iterables to lists 104 | """ 105 | if isinstance(str_or_iterable, str): 106 | return [str_or_iterable] 107 | if str_or_iterable is None: 108 | return [] 109 | return str_or_iterable 110 | 111 | 112 | def run_notebook(*, 113 | notebook_name=None, 114 | notebook_path=None, 115 | output_notebook_name=None, 116 | output_notebook_path=None, 117 | timeout=-1, 118 | notebook_version=4, 119 | kernel='python3', 120 | ): 121 | """Execute a jupyter notebook 122 | 123 | kernel name is an issue: https://github.com/jupyter/nbconvert/issues/515 124 | 125 | """ 126 | if notebook_path is None: 127 | notebook_path = paths['notebook_path'] 128 | else: 129 | notebook_path = pathlib.Path(notebook_path) 130 | 131 | if output_notebook_path is None: 132 | output_notebook_path = paths['interim_data_path'] 133 | else: 134 | output_notebook_path = pathlib.Path(output_notebook_path) 135 | 136 | if output_notebook_name is None: 137 | output_notebook_name = notebook_name 138 | 139 | output_notebook_fq = output_notebook_path / output_notebook_name 140 | 141 | with open(notebook_path / notebook_name) as f: 142 | nb = nbformat.read(f, as_version=notebook_version) 143 | 144 | ep = ExecutePreprocessor(timeout=timeout, kernel_name=kernel) 145 | try: 146 | out = ep.preprocess(nb, {'metadata': {'path': notebook_path}}) 147 | except CellExecutionError: 148 | out = None 149 | msg = f"""Error executing the notebook "{notebook_name}". 150 | 151 | See notebook "{str(output_notebook_fq)}" for the traceback.' 152 | """ 153 | logger.error(msg) 154 | raise 155 | finally: 156 | with open(output_notebook_fq, mode='w', encoding='utf-8') as f: 157 | nbformat.write(nb, f) 158 | return output_notebook_name 159 | -------------------------------------------------------------------------------- /src/kvstore.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import pathlib 3 | from collections.abc import MutableMapping 4 | 5 | class KVStore(MutableMapping): 6 | """Dictionary-like key-value store backed to disk by a ConfigParser (ini) file 7 | 8 | Basic functionality is that of a dictionary, with the addition of an implicit 9 | `config_file` and `config_section`: 10 | 11 | >>> getfixture('manage_config_ini') # This is just a test fixture, please disregard 12 | >>> d = KVStore({'key1':'value1'}, key2='value2') 13 | >>> d['key3'] = 'value3' 14 | >>> d 15 | KVStore(config_file='config.ini', config_section='KVStore', key1='value1', key2='value2', key3='value3') 16 | 17 | 18 | To create a brand new, default KVStore, ignoring anything that may already be on disk: 19 | >>> d = KVStore(overwrite=True) 20 | >>> d 21 | KVStore(config_file='config.ini', config_section='KVStore', ) 22 | 23 | KVStore values can reference other values via substitution using the 24 | `ConfigParser.ExtendedInterpolation` format. When the KVStore is viewed as a dict, 25 | this substitution happens automatically. 26 | 27 | >>> d = KVStore(root_path='/tmp', data_path='${root_path}/data') 28 | >>> dict(d) 29 | {'root_path': '/tmp', 'data_path': '/tmp/data'} 30 | >>> d['data_path'] 31 | '/tmp/data' 32 | 33 | To see the unparsed (raw) value, examine the object's `data` method; e.g. 34 | >>> d.data 35 | {'root_path': '/tmp', 'data_path': '${root_path}/data'} 36 | 37 | This substitution is updated whenever a key changes; e.g. 
38 | >>> d['raw_data_path'] = '${root_path}/raw' 39 | >>> d['root_path'] = '/tmp2' 40 | >>> dict(d) 41 | {'root_path': '/tmp2', 'data_path': '/tmp2/data', 'raw_data_path': '/tmp2/raw'} 42 | >>> d.data 43 | {'root_path': '/tmp2', 'data_path': '${root_path}/data', 'raw_data_path': '${root_path}/raw'} 44 | >>> d['data_path'] 45 | '/tmp2/data' 46 | 47 | Because this object is disk-backed, newly instantiated objects will receive the last set of defaults: 48 | >>> c = KVStore() 49 | >>> dict(c) 50 | {'root_path': '/tmp2', 'data_path': '/tmp2/data', 'raw_data_path': '/tmp2/raw'} 51 | >>> c.data 52 | {'root_path': '/tmp2', 'data_path': '${root_path}/data', 'raw_data_path': '${root_path}/raw'} 53 | 54 | We can force overwriting of this disk-backed file using the `overwrite` parameters: 55 | >>> c = KVStore(overwrite=True) 56 | >>> dict(c), c.data 57 | ({}, {}) 58 | """ 59 | def __init__(self, *args, 60 | config_file=None, config_section="KVStore", overwrite=False, persistent=True, 61 | **kwargs): 62 | """Create a new disk-backed key-value store 63 | 64 | Arguments 65 | --------- 66 | config_file: Path 67 | path to ini (ConfigParser-formatted) file that will be used to persist the KVStore 68 | config_section: String 69 | Section name to be used in the `config_file` 70 | overwrite: Boolean 71 | If True, any config file on disk will be overwritten. 72 | Otherwise, existing values from this file will be used as defaults, 73 | (unless overridden by explicit key/value pairs in the constructor) 74 | *args, **kwargs: 75 | All other arguments will be used as per the standard `dict` constructor 76 | 77 | """ 78 | self._persistent = persistent 79 | if config_file is None: 80 | self._config_file = pathlib.Path("config.ini") 81 | else: 82 | self._config_file = pathlib.Path(config_file) 83 | self._config_section = config_section 84 | self._config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation()) 85 | 86 | self.data = dict() 87 | 88 | if self._config_file.exists() and not overwrite: 89 | self._read() 90 | else: 91 | self._config.add_section(config_section) 92 | self._config.read_dict(self.data) 93 | 94 | self.update({k:v for k,v in self._config.items(self._config_section, raw=True)}) # `update` comes for free from the abc 95 | self.update(dict(*args, **kwargs)) 96 | self._write() 97 | 98 | def __getitem__(self, key): 99 | return self._config.get(self._config_section, key) 100 | 101 | def __setitem__(self, key, value): 102 | self.data[key] = value 103 | self._config.set(self._config_section, key, value) 104 | self._write() 105 | 106 | def __delitem__(self, key): 107 | del self.data[key] 108 | self._config.remove_option(self._config_section, key) 109 | self._write() 110 | 111 | def __iter__(self): 112 | return iter(self.data) 113 | 114 | def __len__(self): 115 | return len(self.data) 116 | 117 | def _read(self): 118 | self._config.read(self._config_file) 119 | if not self._config.has_section(self._config_section): 120 | # File exists but we are adding to a new section of it 121 | self._config.add_section(self._config_section) 122 | 123 | def _write(self): 124 | if self._persistent: 125 | with open(self._config_file, 'w') as fw: 126 | self._config.write(fw) 127 | 128 | def __repr__(self): 129 | kvstr = ", ".join([f"{k}='{v}'" for k,v in self.data.items()]) 130 | return f"KVStore(config_file='{str(self._config_file)}', config_section='{self._config_section}', {kvstr})" 131 | 132 | def __str__(self): 133 | return str({k:v for k,v in self._config.items(self._config_section, raw=False)}) 
134 | 135 | 136 | if __name__ == "__main__": 137 | import doctest 138 | doctest.testmod() 139 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 
100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\make_better_defaults.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\make_better_defaults.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /reference/easydata/easydata.md: -------------------------------------------------------------------------------- 1 | EASYDATA 2 | -------- 3 | 4 | This repo was generated using the Easydata framework, which includes: 5 | 6 | * tools for managing conda environments in a consistent and reproducible way, 7 | * built-in dataset management (including tracking of metadata such as LICENSES and READMEs), 8 | * a prescribed project directory structure, 9 | * workflows and conventions for contributing notebooks and other code. 
10 | 11 | EASYDATA REQUIREMENTS 12 | ------------ 13 | * Make 14 | * conda >= 4.8 (via Anaconda or Miniconda) 15 | * Git 16 | 17 | GETTING STARTED 18 | --------------- 19 | ### Initial Git Configuration and Checking Out the Repo 20 | 21 | If you haven't yet done so, please follow the instructions 22 | in [Setting up git and Checking Out the Repo](reference/easydata/git-configuration.md) in 23 | order to check out the code and set up your remote branches. 24 | 25 | Note: These instructions assume you are using SSH keys (and not HTTPS authentication) with github.com. 26 | If you haven't set up SSH access to github.com, see [Configuring SSH Access to github.com](https://github.com/hackalog/easydata/wiki/Configuring-SSH-Access-to-Github). This also includes instructions for using more than one account with SSH keys. 27 | 28 | Once you've got your local, `origin`, and `upstream` branches configured, you can follow the instructions in this handy [Git Workflow Cheat Sheet](reference/easydata/git-workflow.md) to keep your working copy of the repo in sync with the others. 29 | 30 | ### Setting up your environment 31 | **WARNING**: If you have conda-forge listed as a channel in your `.condarc` (or any other channels other than defaults), you may experience great difficulty generating reproducible conda environments. 32 | 33 | We recommend you remove conda-forge (and all other non-default channels) from your `.condarc` file and [set your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html). Alternate channels can be specified explicitly in your `environment.yml` by prefixing your package name with `channel-name::`; e.g. 34 | ``` 35 | - wheel # install from the default (anaconda) channel 36 | - pytorch::pytorch # install this from the `pytorch` channel 37 | - conda-forge::tokenizers # install this from conda-forge 38 | ``` 39 | 40 | ### Initial setup 41 | 42 | * Make note of the path to your conda binary: 43 | ``` 44 | $ which conda 45 | ~/miniconda3/bin/conda 46 | ``` 47 | * Ensure your `CONDA_EXE` environment variable is set to this value (or edit `Makefile.include` directly): 48 | ``` 49 | export CONDA_EXE=~/miniconda3/bin/conda 50 | ``` 51 | * Create and switch to the virtual environment: 52 | ``` 53 | cd make_better_defaults 54 | make create_environment 55 | conda activate make_better_defaults 56 | ``` 57 | 58 | Now you're ready to run `jupyter notebook` (or jupyterlab) and explore the notebooks in the `notebooks` directory. 59 | 60 | For more instructions on setting up and maintaining your environment (including how to point your environment at your custom forks and work in progress) see [Setting up and Maintaining your Conda Environment Reproducibly](reference/easydata/conda-environments.md). 61 | 62 | ### Loading Datasets 63 | 64 | At this point you will be able to load any of the pre-built datasets with the following set of commands: 65 | ```python 66 | from src.data import Dataset 67 | ds = Dataset.load("<dataset-name>") 68 | ``` 69 | Because of licenses and other distribution restrictions, some of the datasets will require a manual download step. If so, you will be prompted at this point and given instructions for what to do. Some datasets will require local pre-processing. If so, the first time you run the command, you will be executing all of the processing scripts (which can be quite slow). 70 | 71 | After the first time, data will be loaded from cache on disk, which should be fast.
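For example, a first session might look something like the following sketch. (The dataset name `example_dataset` is a placeholder used for illustration only; substitute one of the names returned by the catalog query shown below.)

```python
from src.data import Dataset

# First load may fetch and process raw data; subsequent loads hit the on-disk cache.
ds = Dataset.load("example_dataset")   # placeholder name -- use a real catalog entry

print(type(ds.data))    # e.g. a pandas DataFrame, a graph, or a sparse matrix
print(ds.DESCR[:500])   # description of the dataset
print(ds.LICENSE)       # usage restrictions that travel with the data
```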
72 | 73 | To see which datasets are currently available: 74 | ```python 75 | from src import workflow 76 | workflow.available_datasets(keys_only=True) 77 | ``` 78 | 79 | Note: sometimes datasets can be quite large. If you want to store your data externally, we recommend symlinking your data directory (that is `make_better_defaults/data`) to somewhere with more room. 80 | 81 | For more on Datasets, see [Getting and Using Datasets](reference/easydata/datasets.md). 82 | 83 | ### Using Notebooks and Sharing your Work 84 | This repo has been set up in such a way as to make: 85 | 86 | * environment management easy and reproducible 87 | * sharing analyses via notebooks easy and reproducible 88 | 89 | There are some tricks, hacks, and built in utilities that you'll want to check out: [Using Notebooks for Analysis](reference/easydata/notebooks.md). 90 | 91 | Here are some best practices for sharing using this repo: 92 | 93 | * Notebooks go in the...you guessed it...`notebooks` directory. The naming convention is a number (for ordering), the creator’s initials, and a short - delimited description, e.g. `01-jqp-initial-data-exploration`. Please increment the starting number when creating a new notebook. 94 | * When checking in a notebook, run **Kernel->Restart & Run All** or **Kernel->Restart & Clear Output** and then **Save** before checking it in. 95 | * Put any scripts or other code in the `src` module. We suggest you create a directory using the same initials you put in your notebook titles (e.g. `src/xyz`) You will be able to import it into your notebooks via `from src.xyz import ...`. 96 | * See the Project Organization section below to see where other materials should go, such as reports, figures, and references. 97 | 98 | For more on sharing your work, including using git, submitting PRs and the like, see [Sharing your Work](reference/easydata/sharing-your-work.md). 99 | 100 | ### Quick References 101 | * [Setting up and Maintaining your Conda Environment Reproducibly](reference/easydata/conda-environments.md) 102 | * [Getting and Using Datasets](reference/easydata/datasets.md) 103 | * [Using Notebooks for Analysis](reference/easydata/notebooks.md) 104 | * [Sharing your Work](reference/easydata/sharing-your-work.md) 105 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/make_better_defaults.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/make_better_defaults.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/make_better_defaults" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/make_better_defaults" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /reference/easydata/datasets.md: -------------------------------------------------------------------------------- 1 | # Getting and Using Datasets 2 | 3 | ## TL;DR 4 | To get started, all you really need to know is that you can query for available datasets via 5 | ```python 6 | from src import workflow 7 | workflow.dataset_catalog() 8 | ``` 9 | 10 | and load these datasets via 11 | ```python 12 | from src.data import Dataset 13 | ds = Dataset.load(dataset_name) 14 | ``` 15 | 16 | If you've followed the instructions from building the repo contained in the [README](../README.md), this should just work (if it doesn't, please let us know)! 17 | 18 | You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.DESCR` and `ds.LICENSE`. 19 | 20 | **Warning**: some of the datasets can be quite large. If you want to store your data externally, we recommend symlinking your data directory (that is the `make_better_defaults/data` directory) to somewhere with more room before loading your first `Dataset`. 21 | 22 | 23 | ## Digging Deeper 24 | It is useful to know a little bit more about how Datasets work. 25 | 26 | 27 | ## What is a `Dataset` object? 28 | 29 | A Dataset is the fundamental object we use for turning raw data into useful datasets, reproducibly. 
It is like a scikit-learn-style `Bunch` object --- essentially, a dictionary with some extra magic to make it nicer to work with --- containing the following attributes: 30 | 31 | ``` 32 | data: the processed data 33 | target: (optional) target vector (for supervised learning problems) 34 | metadata: Data about the data 35 | ``` 36 | 37 | The `data` attribute can really be any processed data form that you like: sometimes it's a pandas dataframe (like with `wine_reviews_130k`), a list of tuples containing other data (`reddit_comment_tree_graphs`), or other formats including `scipy.sparse` matrices or `igraph` graphs. The `target` (if you're using it) expects something that matches the `data` in terms of length. 38 | 39 | For a hint as to which data format to expect, you can look at the contents of the `DESCR` attribute, one of the many pieces of metadata that are maintained as part of the `Dataset` object. 40 | 41 | This `metadata` is where things get interesting, so we'll cover it on its own next. 42 | 43 | ## Why `metadata`? 44 | The `metadata` is where the magic lives. It serves several purposes in terms of bookkeeping: 45 | 46 | * it includes `HASHES`, which **improve data reproducibility**, since what you download and process gets checked each step along the way to ensure the raw data matches what is stored in the `dataset_catalog`, 47 | * it provides easy access to **what the data is** via the `DESCR` attribute, 48 | * it provides easy (and continual) **access to the license / usage restrictions** for the data (the `LICENSE` attribute), which helps with knowing what you can do when [Sharing your Work](sharing-your-work.md). 49 | * it provides the **extra data manifest**, `EXTRA`, if your dataset includes additional raw data (extra) files. 50 | 51 | In short, it helps you to know what data you're working with, what you can do with it, and whether something has gone wrong. 52 | 53 | Under the hood, metadata is a dictionary; however, metadata can also be accessed by referring to attributes expressed in uppercase. For example, `ds.metadata['license']` and `ds.LICENSE` refer to the same thing. 54 | 55 | ## Using a `Dataset` 56 | As mentioned before, to load a `Dataset`: 57 | ```python 58 | from src.data import Dataset 59 | ds = Dataset.load("<dataset-name>") 60 | ``` 61 | At this point, if you already have a cached copy of the desired `Dataset` on disk, it will be loaded from there. Otherwise, it will follow the *recipe* for generating the requested `Dataset`; i.e. generate the dataset from raw data, as per the instructions contained in the `dataset_catalog` (described below). 62 | 63 | Because of licenses and other distribution restrictions, some of the datasets may require a **manual download** step. If so, you will be prompted at this point and given instructions for what to do. Some datasets will require local pre-processing. If so, the first time you run the command, you will be executing all of the processing scripts (which can be quite slow). 64 | 65 | After the first load, however, datasets will load from cache on disk, which should be fast. If you need to free up space, you can even delete related source files from `data/raw` and `data/interim`. Just don't touch the `data/processed` directory.
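The load-or-build behaviour described above boils down to a cache check against `data/processed`. The following sketch is illustrative only; it is not the actual Easydata implementation, and the pickle-based cache and `recipe()` callable are assumptions made purely for the example:

```python
import pathlib
import pickle

def load_or_build(name, catalog, processed_dir=pathlib.Path("data/processed")):
    """Illustrative sketch of the cache-then-recipe logic described above."""
    cache_file = processed_dir / f"{name}.pkl"
    if cache_file.exists():                  # cached copy on disk: fast path
        with open(cache_file, "rb") as f:
            return pickle.load(f)
    recipe = catalog[name]                   # otherwise, follow the catalog recipe:
    ds = recipe()                            # fetch raw data, process it, verify hashes
    with open(cache_file, "wb") as f:        # cache the result for next time
        pickle.dump(ds, f)
    return ds
```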
66 | 67 | To access the data, target, or metadata: 68 | ```python 69 | ds.data 70 | ds.target 71 | ds.metadata 72 | ``` 73 | 74 | To access the most common metadata fields: 75 | ```python 76 | ds.DESCR # or ds.metadata['descr'] 77 | ds.LICENSE # or ds.metadata['license'] 78 | ds.HASHES # or ds.metadata['hashes'] 79 | ``` 80 | ## The catalog 81 | While we do our best to keep the documentation in [Available Datasets](docs/available-datasets.md) up-to-date with what's in the code, you can explore all of the currently available `Datasets` via the `dataset_catalog`. The catalog keeps a record of the recipes used to generate a `Dataset` along with relevant hashes that are used to ensure the integrity of data when it's loaded. 82 | 83 | To access the catalog: 84 | 85 | ```python 86 | from src import workflow 87 | workflow.dataset_catalog(keys_only=True) 88 | ``` 89 | If you're interested, set `keys_only=False` to see the complete contents of the metadata that is saved in the catalog. 90 | 91 | 92 | ## Sharing your Data as a `Dataset` object 93 | In order to convert your data to a `Dataset` object, you will need to generate a catalog *recipe* that uses a custom *function for processing your raw data*. Doing so allows us to document all the munging, pre-processing, and data verification necessary to reproducibly build the dataset. 94 | 95 | ## What do you mean, LICENSE? 96 | No conversation on sharing data would be complete without a short discussion about data licenses. This will be covered in [Sharing your Work](sharing-your-work.md). 97 | 98 | 99 | ### Quick References 100 | 101 | * [README](../README.md) 102 | * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) 103 | * [Getting and Using Datasets](datasets.md) 104 | * [Using Notebooks for Analysis](notebooks.md) 105 | * [Sharing your Work](sharing-your-work.md) 106 | * [Troubleshooting Guide](troubleshooting.md) 107 | -------------------------------------------------------------------------------- /reference/easydata/notebooks.md: -------------------------------------------------------------------------------- 1 | # Using Notebooks for Analysis 2 | 3 | Jupyter Notebooks are a fantastic way to do your EDA and share stories about your analysis afterwards. Unfortunately, (and yes, after many years of trying to use notebooks reproducibly, we are opinionated on this) they're a pretty terrible way to share code itself. That said, we still *love* using notebooks for sharing what we've done with others, especially in a workshop setting. 4 | 5 | We've set up this repo in a way to make it easier to use notebooks to share stories, while keeping your code in a python module where it belongs. 6 | 7 | Here are our best practices for using notebooks, while keeping your analyses sharable and reproducible. We've also included some of our favourite tricks and tips below for making using notebooks easier. (If you have more, please share them!) 8 | 9 | ## Naming Convention 10 | Notebooks go in the `notebooks` directory, and are named `dd-xyz-title.ipynb` where: 11 | 12 | * `dd` is an integer indicating the notebook sequence. This is critical when there are dependencies between notebooks 13 | * `xyz` is the author's initials, to help avoid namespace clashes when multiple parties are committing to the same repo 14 | * `title` is the name of the notebook, words separated by hyphens.
15 | 16 | e.g. `00-xyz-sample-notebook.ipynb` 17 | 18 | ## Source Control for Notebooks 19 | Here's where the code part of notebooks starts to get tricky. Notebooks are awful to use with `git` and other source control systems because of the way that they are stored (a giant JSON blob). If you're going to share your notebook back to the main surge repo (which we strongly encourage!): 20 | 21 | 1. Make sure your cells run sequentially (make sure you can **Kernel->Restart & Run All** successfully) 22 | 1. Clear all cell output before checking in your notebook (**Kernel->Restart & Clear Output** before saving). 23 | 24 | We realize that clearing the notebook (which gives cleaner diffs and PRs) is a bit of a trade-off against reproducibility of the notebook, in that you lose the ability to check cell-by-cell whether you're getting the same results. One way to get around this in your own fork is to use the `git nbdiff` feature, which is part of the `nbdiff` package (that is installed in this repo by default). You can find it on the right-hand side of the notebook toolbar, as shown below: 25 | 26 | ![screenshot](images/toolbar-screenshot.png) 27 | 28 | This button will intelligently diff the notebook you have open against the base version. We like to use `git nbdiff` as a visual diffing tool even if we are clearing output before checking in notebooks. 29 | 30 | If you want to give your future users help to see whether they are getting images and figures that match previous analyses, we recommend saving the figures in `reports/figures` and then putting them into a markdown cell in the notebook (so a user can see if what they generated is comparable). 31 | 32 | You can also optionally check your notebook in after a successful **Kernel->Restart & Run All**. This is a little more work to maintain diffs on, but can be nicer for communication without having to run a notebook to see what the results look like. 33 | 34 | ## On code 35 | As mentioned, notebooks aren't a great place for keeping code, as diffs and PRs in a notebook are virtually unreadable. This repo uses an editable python module called `src`. If you write code that you'd like to use in a notebook (e.g. `my_python_file.py`), put it in the `src/xyz` directory, where `xyz` is the author's initials. You should then be able to immediately load it in your notebook via: 36 | ```python 37 | from src.xyz.my_python_file import my_function_name 38 | ``` 39 | If it's not immediately loading (or you need to restart your kernel to make it visible), make sure you run the following cell (preferably at the top of your notebook...see more on useful header cells below): 40 | ```python 41 | %load_ext autoreload 42 | %autoreload 2 43 | ``` 44 | 45 | ## Jupyter Tips and Tricks 46 | First up, if you're in a notebook, keyboard shortcuts can be found using the `Esc` key. Use them. 47 | 48 | ### Useful Header Cells 49 | #### Better display 50 | This cell makes your jupyter notebook use the full screen width. Put this as your first executable cell. You'll thank us. 51 | ```python 52 | from IPython.core.display import display, HTML 53 | display(HTML("<style>.container { width:100% !important; }</style>")) 54 | ``` 55 | #### Autoreloading 56 | The cell 57 | ```python 58 | %load_ext autoreload 59 | %autoreload 2 60 | ``` 61 | lets you autoreload code that's changed in your environment. This means you can update your environment without killing your kernel, or develop code in the `src` module that is immediately available via auto-reload.
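As a concrete (and purely hypothetical) illustration of the `src/xyz` workflow above, a helper module kept alongside your notebooks might look like this:

```python
# src/xyz/my_python_file.py -- hypothetical example of notebook-support code
import pandas as pd

def my_function_name(df: pd.DataFrame) -> pd.DataFrame:
    """Drop exact duplicate rows; a trivial stand-in for real analysis helpers."""
    return df.drop_duplicates().reset_index(drop=True)
```

With the autoreload cell above in place, `from src.xyz.my_python_file import my_function_name` picks up subsequent edits to this file without restarting the kernel.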
62 | #### Python Libraries 63 | It helps to put your dependencies at the top of your notebook. Ours usually look something like this: 64 | ```python 65 | # Python Imports, alphabetized 66 | import pathlib 67 | ... 68 | 69 | #3rd party python modules, alphabetized 70 | import pandas as pd 71 | ... 72 | 73 | #Some plotting libraries 74 | import matplotlib.pyplot as plt 75 | %matplotlib notebook 76 | from bokeh.plotting import show, save, output_notebook, output_file 77 | from bokeh.resources import INLINE 78 | output_notebook(resources=INLINE) 79 | 80 | # Source module imports 81 | from src import paths 82 | from src.data import DataSource, Dataset 83 | from src import workflow 84 | ``` 85 | You can also find most of these header cells in [00-xyz-sample-notebook.ipynb](../notebooks/00-xyz-sample-notebook.ipynb) 86 | 87 | ### Cell Magics 88 | There is a whole world of cell magics. These are bits of code that you can put at the top of a cell that do magical things. A few of our most used ones are: 89 | 90 | * `%%time`: time the cell (use this on slow cells) 91 | * `%debug`: invoke the python debugger (make sure to `exit` when you're done) 92 | * `%%file`: write current cell's content to a file (use `-a` to append) 93 | * `%load`: load a file's contents into the current cell 94 | * `%%bash`: run the cell using bash kernel 95 | 96 | 97 | ### Quick References 98 | 99 | * [README](../README.md) 100 | * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) 101 | * [Getting and Using Datasets](datasets.md) 102 | * [Using Notebooks for Analysis](notebooks.md) 103 | * [Sharing your Work](sharing-your-work.md) 104 | * [Troubleshooting Guide](troubleshooting.md) 105 | -------------------------------------------------------------------------------- /reference/easydata/conda-environments.md: -------------------------------------------------------------------------------- 1 | # Setting up and Maintaining your Conda Environment (Reproducibly) 2 | 3 | The `make_better_defaults` repo is set up with template code to make managing your conda environments easy and reproducible. Not only will _future you_ appreciate this, but so will anyone else who needs to work with your code after today. 4 | 5 | If you haven't yet, configure your conda environment. 6 | 7 | ## Configuring your python environment 8 | Easydata uses conda to manage python packages installed by both conda **and pip**. 9 | 10 | ### Adjust your `.condarc` 11 | **WARNING FOR EXISTING CONDA USERS**: If you have `conda-forge` listed as a channel in your `.condarc` (or any other channels other than `defaults`), **remove them**. These channels should be specified in `environment.yml` instead. 12 | 13 | We also recommend [setting your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html) to reduce package incompatibility problems. This will be the default in conda 5.0, but in order to assure reproducibility, we need to use this behavior now. 14 | 15 | ``` 16 | conda config --set channel_priority strict 17 | ``` 18 | Whenever possible, re-order your channels so that `defaults` is first. 19 | 20 | ``` 21 | conda config --prepend channels defaults 22 | ``` 23 | 24 | **Note for JupyterHub Users**: You will need to store your conda environments in your **home directory** so that they will be persisted across JupyterHub sessions.
25 | ``` 26 | conda config --prepend envs_dirs ~/.conda/envs # Store environments in local dir for JupyterHub 27 | ``` 28 | 29 | ### Fix the CONDA_EXE path 30 | * Make note of the path to your conda binary: 31 | ``` 32 | $ which conda 33 | ~/miniconda3/bin/conda 34 | ``` 35 | * Ensure your `CONDA_EXE` environment variable is set correctly in `Makefile.include` 36 | ``` 37 | export CONDA_EXE=~/miniconda3/bin/conda 38 | ``` 39 | ### Create the conda environment 40 | * Create and switch to the virtual environment: 41 | ``` 42 | cd make_better_defaults 43 | make create_environment 44 | conda activate make_better_defaults 45 | make update_environment 46 | ``` 47 | **Note**: When creating the environment the first time, you really do need to run **both** `make create_environment` and `make update_environment` for the `src` module to install correctly. 48 | 49 | To activate the environment, simply `conda activate make_better_defaults` 50 | 51 | To deactivate it and return to your base environment, use `conda deactivate` 52 | 53 | ## Maintaining your Python environment 54 | 55 | ### Updating your conda and pip environments 56 | The `make` commands `make create_environment` and `make update_environment` are wrappers that allow you to easily manage your conda and pip environments using the `environment.yml` file. 57 | 58 | (If you ever forget which `make` command to run, you can run `make` by itself and it will provide a list of commands that are available.) 59 | 60 | 61 | When adding packages to your python environment, **do not `pip install` or `conda install` directly**. Always edit `environment.yml` and `make update_environment` instead. 62 | 63 | Your `environment.yml` file will look something like this: 64 | ``` 65 | name: make_better_defaults 66 | - pip 67 | - pip: 68 | - -e . # conda >= 4.4 only 69 | - python-dotenv>=0.5.1 70 | - nbval 71 | - nbdime 72 | - umap-learn 73 | - gdown 74 | - setuptools 75 | - wheel 76 | - git>=2.5 # for git worktree template updating 77 | - sphinx 78 | - bokeh 79 | - click 80 | - colorcet 81 | - coverage 82 | - coveralls 83 | - datashader 84 | - holoviews 85 | - matplotlib 86 | - jupyter 87 | ... 88 | ``` 89 | To add any package available from conda, add it to the end of the list. If you have a PYPI dependency that's not available via conda, add it to the list of pip installable dependencies under ` - pip:`. 90 | 91 | You can include any github.com python-based project in the `pip` section via `git+https://github.com/<user>/<repo>`. 92 | 93 | In particular, if you're working off of a fork or a work-in-progress branch of a repo in github.com (say, your personal version of `<repo>`), you can change `git+https://github.com/<user>/<repo>` to 94 | 95 | * `git+https://github.com/<your_username>/<repo>.git` to point to the main branch of your fork and 96 | * `git+https://github.com/<your_username>/<repo>.git@<branch>` to point to a specific branch. 97 | 98 | Once you're done with your edits, run `make update_environment` and voila, you're updated. 99 | 100 | To share your updated environment, check in your `environment.yml` file. (More on this in [Sharing your Work](sharing-your-work.md)) 101 | 102 | 103 | #### Lock files 104 | Now, we'll admit that this workflow isn't perfectly reproducible in the sense that conda still has to resolve versions from the `environment.yml`. To make it more reproducible, running either `make create_environment` or `make update_environment` will generate an `environment.{$ARCH}.lock.yml` (e.g. `environment.i386.lock.yml`).
This file keeps a record of the exact environment that is currently installed in your conda environment `make_better_defaults`. If you ever need to reproduce an environment exactly, you can install from the `.lock.yml` file. (Note: These are architecture dependent). 105 | 106 | #### Using your conda environment in a jupyter notebook 107 | If you make a new notebook, select the `make_better_defaults` environment from within the notebook. If you are somehow in another kernel, select **Kernel -> Change kernel -> Python[conda env:make_better_defaults]**. If you don't seem to have that option, make sure that you ran `jupyter notebooks` with the `make_better_defaults` conda environment enabled, and that `which jupyter` points to the correct (`make_better_defaults`) version of jupyter. 108 | 109 | If you want your environment changes (or `src` module edits) to be immediately available in your running notebooks, make sure to run a notebook cell containing 110 | ``` 111 | %load_ext autoreload 112 | %autoreload 2 113 | ``` 114 | 115 | More on notebooks can be found in [Using Notebooks for Analysis](notebooks.md). 116 | 117 | ### Nuke it from orbit 118 | Sometimes, you need to be sure. Making things reproducible means that blowing things away completely and rebuilding from scratch is always an option. To do so: 119 | ``` 120 | conda deactivate 121 | make delete_environment 122 | make create_environment 123 | conda activate make_better_defaults 124 | touch environment.yml 125 | make update_envrionment 126 | ``` 127 | and then proceed with managing your environment as above. 128 | 129 | ### Quick References 130 | 131 | * [README](../README.md) 132 | * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) 133 | * [Getting and Using Datasets](datasets.md) 134 | * [Using Notebooks for Analysis](notebooks.md) 135 | * [Sharing your Work](sharing-your-work.md) 136 | * [Troubleshooting Guide](troubleshooting.md) 137 | -------------------------------------------------------------------------------- /src/data/utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import pathlib 4 | import random 5 | import sys 6 | import pandas as pd 7 | import numpy as np 8 | from typing import Iterator, List 9 | from functools import partial 10 | from joblib import func_inspect as jfi 11 | 12 | from ..log import logger 13 | from .. import paths 14 | 15 | __all__ = [ 16 | 'deserialize_partial', 17 | 'normalize_labels', 18 | 'partial_call_signature', 19 | 'read_space_delimited', 20 | 'reservoir_sample', 21 | 'serialize_partial', 22 | ] 23 | 24 | _MODULE = sys.modules[__name__] 25 | _MODULE_DIR = pathlib.Path(os.path.dirname(os.path.abspath(__file__))) 26 | 27 | def read_space_delimited(filename, skiprows=None, class_labels=True, metadata=None): 28 | """Read an space-delimited file 29 | 30 | Data is space-delimited. Last column is the (string) label for the data 31 | 32 | Note: we can't use automatic comment detection, as `#` characters are also 33 | used as data labels. 34 | 35 | Parameters 36 | ---------- 37 | skiprows: list-like, int or callable, optional 38 | list of rows to skip when reading the file. 
See `pandas.read_csv` 39 | entry on `skiprows` for more 40 | class_labels: boolean 41 | if true, the last column is treated as the class (target) label 42 | """ 43 | with open(filename, 'r') as fd: 44 | df = pd.read_csv(fd, skiprows=skiprows, skip_blank_lines=True, 45 | comment=None, header=None, sep=' ', dtype=str) 46 | # targets are last column. Data is everything else 47 | if class_labels is True: 48 | target = df.loc[:, df.columns[-1]].values 49 | data = df.loc[:, df.columns[:-1]].values 50 | else: 51 | data = df.values 52 | target = np.zeros(data.shape[0]) 53 | return data, target, metadata 54 | 55 | def normalize_labels(target): 56 | """Map an arbitary target vector to an integer vector 57 | 58 | Returns 59 | ------- 60 | tuple: (mapped_target, label_map) 61 | 62 | where: 63 | mapped_target: integer vector of same shape as target 64 | label_map: dict mapping mapped_target integers to original labels 65 | 66 | Examples 67 | -------- 68 | >>> target = np.array(['a','b','c','a']) 69 | >>> mapped_target, label_map = normalize_labels(target) 70 | >>> mapped_target 71 | array([0, 1, 2, 0]) 72 | 73 | The following should always be true 74 | 75 | >>> all(np.vectorize(label_map.get)(mapped_target) == target) 76 | True 77 | """ 78 | label_map = {k:v for k, v in enumerate(np.unique(target))} 79 | label_map_inv = {v:k for k, v in label_map.items()} 80 | mapped_target = np.vectorize(label_map_inv.get)(target) 81 | 82 | return mapped_target, label_map 83 | 84 | def partial_call_signature(func): 85 | """Return the fully qualified call signature for a (partial) function 86 | """ 87 | func = partial(func) 88 | fa = jfi.getfullargspec(func) 89 | default_kw = {} 90 | if fa.args: 91 | default_kw = dict(zip(fa.args, fa.defaults)) 92 | if getattr(fa, 'kwonlydefaults', None): 93 | fq_keywords = {**default_kw, **fa.kwonlydefaults} 94 | else: 95 | fq_keywords = default_kw 96 | return jfi.format_signature(func.func, *func.args, **fq_keywords) 97 | 98 | def process_dataset_default(metadata=None, **kwargs): 99 | """Placeholder for data processing function""" 100 | dataset_name = kwargs.get('dataset_name', 'unknown-dataset') 101 | logger.error(f"'{dataset_name}()' function not found. 
Define it add it to the `user` namespace for correct behavior") 102 | return None, None, metadata 103 | 104 | def deserialize_partial(func_dict, delete_keys=False, 105 | key_base='load_function', 106 | fail_func=None): 107 | """Convert a serialized function call into a partial 108 | 109 | if there is an error, returns a default function (process_dataset_default) 110 | 111 | Parameters 112 | ---------- 113 | func_dict: dict containing 114 | {key_base}_name: function name 115 | {key_base}_module: module containing function 116 | {key_base}_args: args to pass to function 117 | {key_base}_kwargs: kwargs to pass to function 118 | 119 | delete_keys: Boolean 120 | if True, keys are deleted from `func_dict` if found 121 | key_base: str 122 | name to be used when generating looking up keys in `func_dict` 123 | fail_func: 124 | function to use if no valid function found in the namespace 125 | 126 | """ 127 | 128 | if delete_keys: 129 | args = func_dict.pop(f"{key_base}_args", []) 130 | kwargs = func_dict.pop(f"{key_base}_kwargs", {}) 131 | base_name = func_dict.pop(f"{key_base}_name", 'process_dataset_default') 132 | func_mod_name = func_dict.pop(f'{key_base}_module', None) 133 | else: 134 | args = func_dict.get(f"{key_base}_args", []) 135 | kwargs = func_dict.get(f"{key_base}_kwargs", {}) 136 | base_name = func_dict.get(f"{key_base}_name", 'process_dataset_default') 137 | func_mod_name = func_dict.get(f'{key_base}_module', None) 138 | 139 | if fail_func is None: 140 | fail_func = partial(process_dataset_default, dataset_name=base_name) 141 | 142 | try: 143 | if func_mod_name: 144 | func_mod = importlib.import_module(func_mod_name) 145 | else: 146 | func_mod = _MODULE 147 | func_name = getattr(func_mod, base_name, fail_func) 148 | except ModuleNotFoundError as e: 149 | logger.error(f"Invalid parse_function: {e}") 150 | func_name = fail_func 151 | func = partial(func_name, *args, **kwargs) 152 | 153 | return func 154 | 155 | def serialize_partial(func, key_base='load_function'): 156 | """Serialize a function call to a dictionary. 157 | 158 | Parameters 159 | ---------- 160 | func: function 161 | function to serialize 162 | key_base: str. Default 'load_function' 163 | string to prepend to serialization parameters. 164 | 165 | Returns 166 | ------- 167 | dict containing: 168 | {key_base}_name: function name 169 | {key_base}_module: fully-qualified module name containing function 170 | {key_base}_args: args to pass to function 171 | {key_base}_kwargs: kwargs to pass to function 172 | """ 173 | 174 | entry = {} 175 | if func is None: 176 | logger.warning(f"serialize_partial: `{key_base}` is None. 
Ignoring.") 177 | return entry 178 | func = partial(func) 179 | entry[f'{key_base}_module'] = ".".join(jfi.get_func_name(func.func)[0]) 180 | entry[f'{key_base}_name'] = jfi.get_func_name(func.func)[1] 181 | entry[f'{key_base}_args'] = func.args 182 | entry[f'{key_base}_kwargs'] = func.keywords 183 | return entry 184 | 185 | def reservoir_sample(filename, n_samples=1, random_seed=None): 186 | """Return a random subset of lines from a file 187 | 188 | Parameters 189 | ---------- 190 | filename: path 191 | File to be loaded 192 | n_samples: int 193 | number of lines to return 194 | random_seed: int or None 195 | If set, use this as the random seed 196 | """ 197 | if random_seed is not None: 198 | random.seed(random_seed) 199 | sample = [] 200 | with open(filename) as f: 201 | for n, line in enumerate(f): 202 | if n < n_samples: 203 | sample.append(line.rstrip()) 204 | else: 205 | r = random.randint(0, n_samples) 206 | if r < n_samples: 207 | sample[r] = line.rstrip() 208 | return sample 209 | 210 | 211 | def iter_directory(root: pathlib.Path) -> Iterator[pathlib.Path]: 212 | """ 213 | Iterates the contents of a directory recursively, in depth-first 214 | alphanumeric order. 215 | 216 | Parameters 217 | ---------- 218 | path 219 | Path to the directory to iterate. 220 | 221 | Items 222 | ----- 223 | Paths to the various items contained in the directory and its subdirectories, recursively. The root prepends all the 224 | yielded paths. 225 | """ 226 | def listdir_sorted(path: pathlib.Path) -> List[pathlib.Path]: 227 | return sorted(list(path.iterdir()), reverse=True) 228 | 229 | elements = listdir_sorted(root) 230 | while elements: 231 | item = elements.pop() 232 | yield item 233 | if item.is_dir(): 234 | elements += listdir_sorted(item) 235 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Make Better Defaults documentation build configuration file, created by 4 | # sphinx-quickstart. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import os 15 | import sys 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 20 | # sys.path.insert(0, os.path.abspath('.')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | # needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = [] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = '.rst' 36 | 37 | # The encoding of source files. 38 | # source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = 'index' 42 | 43 | # General information about the project. 
44 | project = u'Make Better Defaults' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | # language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | # today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | # today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | # default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | # add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | # add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | # show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | # modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | # html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | # html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | # html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | # html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | # html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | # html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 
131 | # html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | # html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | # html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | # html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | # html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | # html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | # html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | # html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | # html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | # html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | # html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'make_better_defaultsdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | # 'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | # 'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | # 'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 187 | 'make_better_defaults.tex', 188 | u'Make Better Defaults Documentation', 189 | u"Kjell Wooding ", 'manual'), 190 | ] 191 | 192 | # The name of an image file (relative to this directory) to place at the top of 193 | # the title page. 194 | # latex_logo = None 195 | 196 | # For "manual" documents, if this is true, then toplevel headings are parts, 197 | # not chapters. 198 | # latex_use_parts = False 199 | 200 | # If true, show page references after internal links. 201 | # latex_show_pagerefs = False 202 | 203 | # If true, show URL addresses after external links. 204 | # latex_show_urls = False 205 | 206 | # Documents to append as an appendix to all manuals. 207 | # latex_appendices = [] 208 | 209 | # If false, no module index is generated. 210 | # latex_domain_indices = True 211 | 212 | 213 | # -- Options for manual page output -------------------------------------------- 214 | 215 | # One entry per manual page. List of tuples 216 | # (source start file, name, description, authors, manual section). 217 | man_pages = [ 218 | ('index', 'make_better_defaults', u'Make Better Defaults Documentation', 219 | [u"Kjell Wooding "], 1) 220 | ] 221 | 222 | # If true, show URL addresses after external links. 223 | # man_show_urls = False 224 | 225 | 226 | # -- Options for Texinfo output ------------------------------------------------ 227 | 228 | # Grouping the document tree into Texinfo files. 
List of tuples 229 | # (source start file, target name, title, author, 230 | # dir menu entry, description, category) 231 | texinfo_documents = [ 232 | ('index', 'make_better_defaults', u'Make Better Defaults Documentation', 233 | u"Kjell Wooding ", 'Make Better Defaults', 234 | 'Make Better Defaults: Improving your data science workflows with "make". A Pydata Global 2021 Talk', 'Miscellaneous'), 235 | ] 236 | 237 | # Documents to append as an appendix to all manuals. 238 | # texinfo_appendices = [] 239 | 240 | # If false, no module index is generated. 241 | # texinfo_domain_indices = True 242 | 243 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 244 | # texinfo_show_urls = 'footnote' 245 | -------------------------------------------------------------------------------- /src/data/transformer_functions.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pandas as pd 4 | from tqdm.auto import tqdm 5 | 6 | from sklearn.model_selection import train_test_split 7 | 8 | from . import Dataset, deserialize_partial 9 | from .. import paths 10 | from ..log import logger 11 | from .utils import deserialize_partial 12 | from ..utils import run_notebook 13 | 14 | __all__ = [ 15 | 'run_notebook_transformer', 16 | 'apply_single_function', 17 | 'csv_to_pandas', 18 | 'new_dataset', 19 | 'sklearn_train_test_split', 20 | 'sklearn_transform', 21 | ] 22 | 23 | def run_notebook_transformer(dsdict, *, 24 | notebook_name, 25 | notebook_path, 26 | output_dataset_names, 27 | ): 28 | """ 29 | Use a notebook as a transformer function in the dataset graph. 30 | The notebook *must* write the output datasets to disk; i.e. once their 31 | notebook has run, this function assumes Dataset.from_disk() will succeed 32 | for all output datasets listed in `output_dataset_names` 33 | 34 | Parameters 35 | ---------- 36 | dsdict: Ignored 37 | Needed to conform to transformer API, but ignored, as these will need 38 | to be loaded in the notebook itself. 39 | notebook_name: None or str 40 | Name of current notebook. If None, an attempt will be made to infer it. 41 | notebook_path: None or str or Path 42 | If None, paths['notebook_path'] will be used 43 | output_dataset_names: List(str) 44 | List of datasets that were created (and saved to disk) by the notebook. 45 | These will be loaded from disk and returned by the transformer 46 | """ 47 | if notebook_path == 'None': 48 | logger.error("JSON encoding problem with notebook_path. 
Please regenerate transformer") 49 | 50 | logger.debug(f"Using notebook:{notebook_name} as transformer to generate {output_dataset_names}") 51 | output_notebook = run_notebook(notebook_path=notebook_path, notebook_name=notebook_name) 52 | logger.debug(f"See {paths['interim_data_path']/output_notebook} for output of this process") 53 | ods_dict = {} 54 | for ods in output_dataset_names: 55 | logger.debug(f"Loading output dataset:{ods} from disk") 56 | ods_dict[ods] = Dataset.from_disk(ods) 57 | return ods_dict 58 | 59 | 60 | def new_dataset(dsdict, *, dataset_name, dataset_opts=None): 61 | """ 62 | Transformer function: create a dataset from its default constructor 63 | 64 | Parameters 65 | ---------- 66 | dsdict: ignored 67 | 68 | dataset_name: 69 | Name of dataset to create 70 | dataset_opts: dict 71 | kwargs dict to pass to Dataset constructor 72 | 73 | Returns 74 | ------- 75 | dsdict {dataset_name: Dataset} 76 | """ 77 | if dataset_opts is None: 78 | dataset_opts = {} 79 | ds = Dataset(dataset_name, **dataset_opts) 80 | return {dataset_name: ds} 81 | 82 | def sklearn_train_test_split(ds_dict, **split_opts): 83 | """Transformer Function: performs a train/test split. 84 | 85 | for each `dset` in ds_dict, this transformer creates two new 86 | datasets: {dset.name}_test and {dset.name}_train 87 | 88 | Parameters 89 | ---------- 90 | ds_dict: 91 | input datasets 92 | **split_opts: 93 | Remaining options will be passed to `train_test_split` 94 | 95 | """ 96 | new_ds = {} 97 | for ds_name, dset in ds_dict.items(): 98 | 99 | for kind in ['train', 'test']: 100 | dset_name = f"{dset_name}_{kind}" 101 | dset_meta = {**dset.metadata, 'split':kind, 'split_opts':split_opts} 102 | new_ds[dset_name] = Dataset(dataset_name=dset_name, metadata=dset_meta) 103 | X_train, X_test, y_train, y_test = train_test_split(dset.data, dset.target, **split_opts) 104 | 105 | new_ds[f'{dset_name}_train'].data = X_train 106 | new_ds[f'{dset_name}_train'].target = y_train 107 | new_ds[f'{dset_name}_test'].data = X_test 108 | new_ds[f'{dset_name}_test'].target = y_test 109 | return new_ds 110 | 111 | def sklearn_transform(ds_dict, transformer_name, transformer_opts=None, subselect_column=None, **opts): 112 | """ 113 | Wrapper for any 1:1 (data in to data out) sklearn style transformer. Will run the .fit_transform 114 | method of the transformer on dset.data. If subselect_column is not None, it will treat the data 115 | like a dataframe and will subselect dset.data[subselect_column] to run the transformer on. 116 | 117 | Parameters 118 | ---------- 119 | ds_dictet: 120 | Datasets upon which to apply transforms 121 | transformer_name: string 122 | sklearn style transformer with a .fit_transform method avaible via sklearn_transformers. 123 | transformer_opts: dict 124 | options to pass on to the transformer 125 | subselect_column: string 126 | column name for dset.data to run the transformer on 127 | return_whole: boolean 128 | return the whole dataframe with a new column named "transformed" 129 | **opts: 130 | options to pass on to the fit_transform method 131 | 132 | Returns 133 | ------- 134 | Datasets whose data are the result of the transformer.fit_transform 135 | """ 136 | new_dsdict = {} 137 | for ds_name, dset in ds_dict.items(): 138 | if transformer_name in sklearn_transformers(): 139 | transformer = sklearn_transformers(keys_only=False).get(transformer_name)(**transformer_opts) 140 | else: 141 | raise ValueError(f"Invalid transformer name: {transformer_name}. 
See sklearn_transformers for available names.") 142 | if subselect_column: 143 | new_data = transformer.fit_transform(dset.data[subselect_column], **opts) 144 | else: 145 | new_data = transformer.fit_transform(dset.data, **opts) 146 | 147 | new_dsname = f"{dset.name}_{transformer.__class__.__name__}" 148 | new_dsdict[new_dsname] = Dataset(dataset_name=new_dsname, metadata=dset.metadata, data=new_data) 149 | return new_dsdict 150 | 151 | def csv_to_pandas(ds_dict, *, output_map, **opts): 152 | """ 153 | 154 | Parameters 155 | ---------- 156 | ds_dict: 157 | input datasets. If multiple datasets, processing will stop at first matching csv_filename 158 | output_map: dict(new_dataset_name:csv_filename) 159 | datasets to create. new_dataset_name will be created using csv_filename as its data column. 160 | **opts: 161 | Remaining options will be ignored 162 | """ 163 | new_ds = {} 164 | df = None 165 | for ds_name, dset in ds_dict.items(): 166 | extra = dset.metadata.get('extra', None) 167 | if extra is not None: 168 | logger.debug(f"Input dataset {ds_name} has extra data. Processing...") 169 | for rel_dir, file_dict in extra.items(): 170 | for new_dsname, csv_filename in output_map.items(): 171 | if csv_filename in file_dict: 172 | logger.debug(f"Found {csv_filename}. Creating {new_dsname} dataset") 173 | path = paths['processed_data_path'] / rel_dir / csv_filename 174 | df = pd.read_csv(path) 175 | new_metadata = dset.metadata 176 | new_metadata.pop('extra', None) 177 | new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=df, metadata=new_metadata) 178 | return new_ds 179 | 180 | 181 | 182 | def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_descr_txt, drop_extra, **opts): 183 | """ 184 | Parameters 185 | ---------- 186 | ds_dict: 187 | input datasets. 
188 | source_dataset_name: 189 | name of the dataset that the new dataset will be derived from 190 | dataset_name: 191 | name of the new dataset_catalog 192 | added_descr_txt: Default None 193 | new description text to be appended to the metadata descr 194 | serialized_function: 195 | function (serialized by src.utils.serialize_partial) to run on .data to produce the new .data 196 | drop_extra: boolean 197 | drop the .extra part of the metadata 198 | **opts: 199 | Remaining options will be ignored 200 | """ 201 | 202 | new_ds = {} 203 | 204 | logger.debug(f"Loading {source_dataset_name}...") 205 | ds = ds_dict.get(source_dataset_name) 206 | 207 | new_metadata = ds.metadata.copy() 208 | new_metadata['descr'] += added_descr_txt 209 | if drop_extra: 210 | if new_metadata.get('extra', 0) != 0: 211 | new_metadata.pop('extra') 212 | 213 | logger.debug(f"Applying data function...") 214 | data_function=deserialize_partial(serialized_function) 215 | new_data = data_function(ds.data) 216 | 217 | if ds.target is not None: 218 | new_target = ds.target.copy() 219 | else: 220 | new_target = None 221 | 222 | new_ds[dataset_name] = Dataset(dataset_name=dataset_name, data=new_data, target=new_target, metadata=new_metadata) 223 | return new_ds 224 | 225 | 226 | new_metadata = ds.metadata.copy() 227 | 228 | new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=preprocessed_corpus, metadata=new_metadata) 229 | return new_ds 230 | -------------------------------------------------------------------------------- /src/tests/data/dataset-test.json: -------------------------------------------------------------------------------- 1 | { 2 | "wine_reviews": { 3 | "dataset_name": "wine_reviews", 4 | "descr": "\n### Content\n\nThis dataset contains three files:\n\n * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.\n\n * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. (Does not have Taster info)\n\n * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.\n\nThe data consists of 13 fields:\n\n* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)\n* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature\n* Variety: the type of grapes used to make the wine (ie Pinot Noir)\n* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.\n* Country: the country that the wine is from\n* Province: the province or state that the wine is from\n* Region 1: the wine growing area in a province or state (ie Napa)\n* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank\n* Winery: the winery that made the wine\n* Designation: the vineyard within the winery where the grapes that made the wine are from\n* Price: the cost for a bottle of the wine\n* Taster Name: name of the person who tasted and reviewed the wine\n* Taster Twitter Handle: Twitter handle for the person who tasted ane reviewed the wine\n\nUPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entires was resolved\n\n### Acknowledgements\n\nThe data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. 
(https://github.com/zackthoutt/wine-deep-learning)\n\nUPDATE 11/24/2017\nAfter feedback from users of the dataset I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the tasters name, and the taster's Twitter handle. This should also fix the duplicate entry issue.", 5 | "hashes": { 6 | "data": "sha1:120c359cedf8b75e1e9fb7d280668e51eea2e43f", 7 | "target": "sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b" 8 | }, 9 | "license": "CC BY-NC-SA 4.0" 10 | }, 11 | "wine_reviews_130k": { 12 | "dataset_name": "wine_reviews_130k", 13 | "descr": "\n### Content\n\nThis dataset contains three files:\n\n * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.\n\n * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. (Does not have Taster info)\n\n * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.\n\nThe data consists of 13 fields:\n\n* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)\n* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature\n* Variety: the type of grapes used to make the wine (ie Pinot Noir)\n* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.\n* Country: the country that the wine is from\n* Province: the province or state that the wine is from\n* Region 1: the wine growing area in a province or state (ie Napa)\n* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank\n* Winery: the winery that made the wine\n* Designation: the vineyard within the winery where the grapes that made the wine are from\n* Price: the cost for a bottle of the wine\n* Taster Name: name of the person who tasted and reviewed the wine\n* Taster Twitter Handle: Twitter handle for the person who tasted ane reviewed the wine\n\nUPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entires was resolved\n\n### Acknowledgements\n\nThe data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. (https://github.com/zackthoutt/wine-deep-learning)\n\nUPDATE 11/24/2017\nAfter feedback from users of the dataset I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the tasters name, and the taster's Twitter handle. This should also fix the duplicate entry issue.", 14 | "hashes": { 15 | "data": "sha1:9d8db83e00877dbe2ce862040d677b29eb4e23b3", 16 | "target": "sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b" 17 | }, 18 | "license": "CC BY-NC-SA 4.0" 19 | }, 20 | "wine_reviews_130k_varietals_75": { 21 | "dataset_name": "wine_reviews_130k_varietals_75", 22 | "descr": "\n### Content\n\nThis dataset contains three files:\n\n * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.\n\n * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. 
(Does not have Taster info)\n\n * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.\n\nThe data consists of 13 fields:\n\n* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)\n* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature\n* Variety: the type of grapes used to make the wine (ie Pinot Noir)\n* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.\n* Country: the country that the wine is from\n* Province: the province or state that the wine is from\n* Region 1: the wine growing area in a province or state (ie Napa)\n* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank\n* Winery: the winery that made the wine\n* Designation: the vineyard within the winery where the grapes that made the wine are from\n* Price: the cost for a bottle of the wine\n* Taster Name: name of the person who tasted and reviewed the wine\n* Taster Twitter Handle: Twitter handle for the person who tasted ane reviewed the wine\n\nUPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entires was resolved\n\n### Acknowledgements\n\nThe data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. (https://github.com/zackthoutt/wine-deep-learning)\n\nUPDATE 11/24/2017\nAfter feedback from users of the dataset I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the tasters name, and the taster's Twitter handle. This should also fix the duplicate entry issue.Subselection of the dataset that only includes entries for wines with a given varietal that appeas in at least 75 different entries", 23 | "hashes": { 24 | "data": "sha1:d76d24f6ecd309aec82545c39af107c82edebc2f", 25 | "target": "sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b" 26 | }, 27 | "license": "CC BY-NC-SA 4.0" 28 | }, 29 | "wine_reviews_150k": { 30 | "dataset_name": "wine_reviews_150k", 31 | "descr": "\n### Content\n\nThis dataset contains three files:\n\n * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.\n\n * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. 
(Does not have Taster info)\n\n * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.\n\nThe data consists of 13 fields:\n\n* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)\n* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature\n* Variety: the type of grapes used to make the wine (ie Pinot Noir)\n* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.\n* Country: the country that the wine is from\n* Province: the province or state that the wine is from\n* Region 1: the wine growing area in a province or state (ie Napa)\n* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank\n* Winery: the winery that made the wine\n* Designation: the vineyard within the winery where the grapes that made the wine are from\n* Price: the cost for a bottle of the wine\n* Taster Name: name of the person who tasted and reviewed the wine\n* Taster Twitter Handle: Twitter handle for the person who tasted ane reviewed the wine\n\nUPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entires was resolved\n\n### Acknowledgements\n\nThe data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. (https://github.com/zackthoutt/wine-deep-learning)\n\nUPDATE 11/24/2017\nAfter feedback from users of the dataset I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the tasters name, and the taster's Twitter handle. This should also fix the duplicate entry issue.", 32 | "hashes": { 33 | "data": "sha1:84c8540f48e1350e0cf5c92a3064711b96e1a5ff", 34 | "target": "sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b" 35 | }, 36 | "license": "CC BY-NC-SA 4.0" 37 | } 38 | } -------------------------------------------------------------------------------- /src/helpers.py: -------------------------------------------------------------------------------- 1 | ## Script common ways of adding a dataset to the workflow 2 | 3 | from functools import partial 4 | import pathlib 5 | 6 | from .log import logger 7 | from . import paths 8 | from .exceptions import EasydataError 9 | 10 | from .data import (DataSource, Dataset, hash_file, DatasetGraph, Catalog, 11 | serialize_transformer_pipeline) 12 | from .data.transformer_functions import csv_to_pandas, new_dataset, apply_single_function, run_notebook_transformer 13 | from .data.extra import process_extra_files 14 | from .data.utils import serialize_partial 15 | 16 | __all__ = [ 17 | 'notebook_as_transformer', 18 | 'dataset_from_csv_manual_download', 19 | 'dataset_from_metadata', 20 | 'dataset_from_single_function', 21 | ] 22 | 23 | 24 | def notebook_as_transformer(notebook_name, *, 25 | input_datasets=None, 26 | output_datasets, 27 | overwrite_catalog=False, 28 | notebook_path=None, 29 | transformer_name=None 30 | ): 31 | """Use a Jupyter notebook as a Dataset transformer funtion. 32 | 33 | This helper simplifies the process of using a jupyter notebook as a transformer function 34 | in the DatasetGraph. 
35 | 36 | Parameters 37 | ---------- 38 | notebook_name: string 39 | filename of notebook. relative to `notebook_path`. Obviously, notebook must exist 40 | 41 | """ 42 | 43 | if notebook_path is not None: 44 | notebook_fq = pathlib.Path(notebook_path) / notebook_name 45 | notebook_path = str(notebook_path) 46 | else: 47 | notebook_fq = paths['notebook_path'] / notebook_name 48 | 49 | dag = DatasetGraph() 50 | write_dataset_to_catalog = write_transformer_to_catalog = overwrite_catalog 51 | 52 | if not notebook_fq.exists(): 53 | raise EasydataError(f"Notebook {notebook_fq} does not exist. Cannot be used as transformer.") 54 | 55 | dsdict = {} 56 | for ods in output_datasets: 57 | ods.update_hashes() 58 | if ods.name in dag.datasets: 59 | logger.debug(f"dataset:{ods.name} already in catalog") 60 | 61 | if dag.check_dataset_hashes(ods.name, ods.HASHES): 62 | logger.debug(f"Hashes match for {ods.name}. Skipping Overwrite.") 63 | write_dataset_to_catalog = False 64 | else: 65 | logger.warning(f"Hashes do not match for {ods.name}") 66 | if overwrite_catalog is False: 67 | raise ValidationError(f"Hashes for Dataset:{ods.name} differ from catalog, but overwrite_catalog is False") 68 | else: 69 | logger.debug(f"dataset:{ods.name} not in catalog. Adding...") 70 | write_dataset_to_catalog=True 71 | 72 | logger.debug(f"Writing dataset:{ods.name} to disk") 73 | ods.dump(exists_ok=True, update_catalog=write_dataset_to_catalog) 74 | 75 | logger.debug(f"Generating Transformer edge") 76 | transformers = [partial(run_notebook_transformer, 77 | notebook_path=notebook_path, 78 | notebook_name=notebook_name, 79 | output_dataset_names=[ds.name for ds in output_datasets])] 80 | 81 | transformer = dag.add_edge(input_datasets=[ds.name for ds in input_datasets], 82 | output_datasets=[ds.name for ds in output_datasets], 83 | transformer_pipeline=serialize_transformer_pipeline(transformers), 84 | overwrite_catalog=write_transformer_to_catalog, 85 | edge_name=transformer_name, 86 | generate=False) 87 | 88 | dsdict[ods.name] = ods 89 | return dsdict 90 | 91 | # Create a Dataset from a single csv file 92 | def dataset_from_csv_manual_download(ds_name, csv_path, download_message, 93 | license_str, descr_str, *, hash_type='sha1', 94 | hash_value=None, 95 | overwrite_catalog=False,): 96 | """ 97 | Add a dataset to the catalog files where .data is the dataframe from a 98 | single .csv file obtained via manual download. 99 | 100 | ds_name: str 101 | name of the resulting dataset 102 | csv_path: path 103 | relative path to the .csv file from paths['raw_data_path'] 104 | download_message: str 105 | hash_type: {'sha1', 'md5'} 106 | hash_value: string. 
required 107 | Hash, computed via the algorithm specified in `hash_type` 108 | license_str: str 109 | Contents of metadata license as text 110 | descr_str: 111 | Contents of the metadata description as text 112 | overwrite_catalog: boolean 113 | If True, existing entries in datasets and transformers catalogs will be 114 | overwritten 115 | 116 | Returns 117 | ------- 118 | Dataset that was added to the Transformer graph 119 | """ 120 | 121 | dataset_catalog = Catalog.load('datasets') 122 | if ds_name in dataset_catalog and not overwrite_catalog: 123 | raise KeyError(f"'{ds_name}' already in catalog") 124 | csv_path = pathlib.Path(csv_path) 125 | # Create a datasource 126 | raw_ds_name = ds_name+"-raw" 127 | logger.debug(f"Creating raw datasource: {raw_ds_name}") 128 | dsrc = DataSource(raw_ds_name) 129 | 130 | if hash_value is None: 131 | file_path = paths['raw_data_path'] / csv_path 132 | hash_value = hash_file(file_path, algorithm=hash_type) 133 | dsrc.add_manual_download(message=download_message, 134 | file_name=str(csv_path), 135 | hash_type=hash_type, 136 | hash_value=hash_value, 137 | unpack_action='copy', 138 | force=True) 139 | dsrc.add_metadata(contents=descr_str, force=True) 140 | dsrc.add_metadata(contents=license_str, kind='LICENSE', force=True) 141 | 142 | process_function = process_extra_files 143 | process_function = process_extra_files 144 | process_function_kwargs = {'do_copy':True, 145 | 'file_glob':str(csv_path.name), 146 | 'extra_dir': raw_ds_name+'.extra', 147 | 'extract_dir': raw_ds_name} 148 | dsrc.process_function = partial(process_function, **process_function_kwargs) 149 | datasource_catalog = Catalog.load('datasources') 150 | datasource_catalog[dsrc.name] = dsrc.to_dict() 151 | 152 | # Add a dataset from the datasource 153 | dag = DatasetGraph(catalog_path=paths['catalog_path']) 154 | dag.add_source(output_dataset=raw_ds_name, datasource_name=raw_ds_name, overwrite_catalog=True) 155 | # Run the dataset creation code to add it to the catalog 156 | ds = Dataset.from_catalog(raw_ds_name) 157 | 158 | # Add transformer to create the final dataset 159 | transformers = [partial(csv_to_pandas, 160 | output_map={ds_name:csv_path.name})] 161 | 162 | dag.add_edge(input_dataset=raw_ds_name, 163 | output_dataset=ds_name, 164 | transformer_pipeline=serialize_transformer_pipeline(transformers), 165 | overwrite_catalog=True) 166 | 167 | ds = Dataset.from_catalog(ds_name) 168 | return ds 169 | 170 | def dataset_from_metadata(dataset_name, metadata=None, overwrite_catalog=False): 171 | """Create Dataset from supplied metadata 172 | 173 | Dataset will be a source node in the Transformer graph 174 | 175 | Parameters 176 | ---------- 177 | dataset_name: 178 | name of dataset to be created 179 | metadata: 180 | dictionary of metadata fields for dataset creation 181 | overwrite_catalog: boolean 182 | If True, existing entries in datasets and transformers catalogs will be 183 | overwritten 184 | 185 | Returns 186 | ------- 187 | Dataset that was added to the Transformer graph 188 | 189 | """ 190 | dataset_catalog = Catalog.load('datasets') 191 | if dataset_name in dataset_catalog and not overwrite_catalog: 192 | raise KeyError(f"'{dataset_name}' already in catalog") 193 | if metadata is None: 194 | metadata = {} 195 | dag = DatasetGraph() 196 | ds_opts = {'metadata': metadata} 197 | transformers = [partial(new_dataset, dataset_name=dataset_name, dataset_opts=ds_opts)] 198 | dag.add_source(output_dataset=dataset_name, 199 | 
transformer_pipeline=serialize_transformer_pipeline(transformers), 200 | overwrite_catalog=overwrite_catalog) 201 | ds = Dataset.from_catalog(dataset_name) 202 | return ds 203 | 204 | 205 | def dataset_from_single_function(*, source_dataset_name, dataset_name, data_function, added_descr_txt, drop_extra=True, overwrite_catalog=False): 206 | """ 207 | Create a derived dataset (dataset_name) via a single function call on .data from a 208 | previous dataset (source_dataset_name). 209 | 210 | Parameters 211 | ---------- 212 | source_dataset_name: 213 | name of the dataset that the new dataset will be derived from 214 | dataset_name: 215 | name of the new dataset_catalog 216 | added_descr_txt: Default None 217 | new description text to be appended to the metadata descr 218 | data_function: 219 | function (from src module) to run on .data to produce the new .data 220 | overwrite_catalog: boolean 221 | if True, existing entries in datasets and transformers catalogs will be overwritten 222 | """ 223 | dag = DatasetGraph(catalog_path=paths['catalog_path']) 224 | serialized_function = serialize_partial(data_function) 225 | transformers = [partial(apply_single_function, source_dataset_name=source_dataset_name, dataset_name=dataset_name, 226 | serialized_function=serialized_function, added_descr_txt=added_descr_txt, drop_extra=drop_extra)] 227 | dag.add_edge(input_dataset=source_dataset_name, 228 | output_dataset=dataset_name, 229 | transformer_pipeline=serialize_transformer_pipeline(transformers), 230 | overwrite_catalog=overwrite_catalog) 231 | ds = Dataset.from_catalog(dataset_name) 232 | logger.debug(f"{dataset_name} added to catalog") 233 | return ds 234 | -------------------------------------------------------------------------------- /reference/easydata/sharing-your-work.md: -------------------------------------------------------------------------------- 1 | # Sharing your Work 2 | 3 | * [Contributor Guidelines and Checklist](#contributor-guide-and-checklist) 4 | * [Best Practices for Sharing](#best-practices-for-sharing) 5 | * [Sharing Code Using Git and github.com](#sharing-code-using-git-and-github) 6 | * [Sharing Datasets](#sharing-datasets) 7 | * [Sharing Conda Environments](#sharing-conda-environments) 8 | * [Sharing Notebooks](#sharing-notebooks) 9 | * [Quick Guide to Licenses](#quick-guide-to-licenses) 10 | 11 | ## Contributor Guidelines and Checklist 12 | 13 | The main impetus of following the **recommended workflow** for this project is to help make it easier to share your datasets, code and analyses in a reproducible way and easy-to-use way. 14 | 15 | We **want** you to share your work. We understand that your work may still be a **work-in-progress** when you first start to share it. We encourage that. There are three main ways to contribute to this repo: 16 | 17 | 18 | * **Filing and reporting issues:** Please don't be shy here. Chances are if you encounter an issue, someone else already has, or someone else will encounter the same issue in the future. Reporting helps us to find solutions that will work for everyone. Hacks and your personal work-arounds are not reproducible. No issue is too small. Share the love and let us solve issues as best we can for everyone. Issues include anything from "I had trouble understanding and following the documentation", to feature requests, to bugs in the shared codebase iteself. 19 | 1. First up, [make sure that you're working with the most up-to-date version](git-workflow.md) of the codebase. 20 | 1. 
Check the [troubleshooting guide](troubleshooting.md) to see if a solution has already been documented. 21 | 1. Check if the issue has been reported already. If so, make a comment on the issue to indicate that you're also having said issue. 22 | 1. Finally, if your issue hasn't been resolved at this stage, file an issue. For bug reports, please include reproducers. 23 | * **Submitting Pull Requests (PRs):** This is the way to share your work if it involves any code. To prepare your PR, follow the [contributor checklist](#contributor-checklist). In the meantime, follow the [recommended best practices](#best-practices-for-sharing) to make your life easier when you are ready to share. 24 | 25 | ### Contributor Checklist 26 | 27 | When is my work ready to share? Let's find out! 28 | 29 | When you are ready to share your notebook or code with others, you'll be able to tick all of the following boxes. 30 | 31 | #### Notebooks and Code 32 | - [ ] Notebooks are in the `notebooks` directory following the [notebook naming convention](notebooks.md#naming-convention). 33 | - [ ] Notebooks load data via the `Dataset.load()` API to access an available Dataset. 34 | - [ ] Functions are in `src/user_name` and accessed in notebooks via something like `from src.user_name import my_function`. If you have `def my_function` in your notebook or anything more elaborate, there's a good chance that it should be in the `src` module. 35 | - [ ] Notebook cells run sequentially (i.e. **Kernel->Restart & Run All** runs to completion successfully). 36 | - [ ] *(Optional but generally recommended)*: All notebook cell output has been cleared before checking it in (i.e. **Kernel->Restart & Clear Output** before saving). 37 | 38 | #### Licenses 39 | - [ ] Decide on a [license for your data-derived work (e.g. images)](#quick-guide-to-licenses) and if it's not the same as that of the dataset you used, mark it appropriately as per your license of choice (assuming it's compatible with the dataset's license). By default, the license of derived work will be the same as the dataset it came from. 40 | 41 | #### Environment and Tests 42 | - [ ] Share your conda environment. Check in your `environment.yml` file if you've made any changes. 43 | * If there's any chance that you added something to the conda environment needed to run your code that was **not** added via your `environment.yml` file as per [Setting up and Maintaining your Conda Environment (Reproducibly)](conda-environments.md), [delete your environment and recreate it](conda-environments.md#nuke-it-from-orbit). 44 | - [ ] *(Optional)* Make sure all tests pass (run `make test`). This runs all of the dataset integration tests, so if you don't have a lot of room on your machine (it will build all of the datasets if you haven't yet), you may want to skip this step. 45 | - [ ] At a minimum, make sure all of the tests for your code pass. To subselect your tests, you can run `pytest --pyargs src -k your_test_filename`. 46 | 47 | #### Final Checks 48 | - [ ] You've [merged the latest version](git-workflow.md) of `upstream/main` into your branch. 49 | - [ ] [Submitted a PR via github.com](#how-to-submit-a-pr) in **Draft** status and checked the PR diff to make sure that you aren't missing anything critical, you're not adding anything extraneous, and you don't have any merge conflicts. 50 | 51 | Once this checklist is complete, take your **PR** out of **Draft** status. It's ready to go!
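To make the final checks concrete, here is a minimal command-line sketch of one way to run through them. It assumes your fork is the `origin` remote, the shared team repo is the `upstream` remote (as in the git workflow described below), and your work lives on a branch called `my_new_branch`; substitute your own branch and test file names.

```
# bring your branch up to date with the shared repo
git fetch upstream
git checkout my_new_branch
git merge upstream/main

# make sure your conda environment matches environment.yml
make update_environment

# run the tests for your code (or `make test` to run everything, including dataset builds)
pytest --pyargs src -k your_test_filename

# push to your fork and open (or update) the PR from there
git push origin my_new_branch
```

These are the same commands covered in more detail in [How to submit a PR](#how-to-submit-a-pr) and the [Easydata git Workflow](git-workflow.md).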
52 | 53 | As a person who is trying to contribute and share your work with others, it may at times feel like this is a lot of work. We get that, and find it useful to think of it this way: for every 5 minutes extra that you put into making your work reproducible, everyone else who tries to run or use your work will spend at least 5 minutes less trying to get it to work for them. In other words, making your work reproducible is part of being a good citizen and helping us all to learn from each other as we go. Thank you for helping us to share and use your work! 54 | 55 | 56 | ## Best Practices for Sharing 57 | ### Sharing Code Using Git and github.com 58 | 59 | Quick References: 60 | 61 | * Keeping up-to-date: [Our Git Workflow](git-workflow.md) 62 | * Recommended [Git tutorial](https://github.com/hackalog/cookiecutter-easydata/wiki/Git-Tutorial) 63 | 64 | 65 | There are several ways to use Git and github.com successfully, and a lot more ways to use them unsuccessfully when working with lots of other people. Here are some best practices that we suggest you use to make your life, and our lives easier. This workflow we suggest makes choosing which changes to put in a pull request easier, and helps to avoid crazy merge conflicts. 66 | 67 | First off, follow the [Getting Started](../README.md#getting-started) instructions for setting yourself up to work from your own fork. The idea here will be to keep `upstream/main`, your local `main` and your `origin/main` all in sync with each other. 68 | 69 | Any changes should be made in a separate branch---**not** your `main`---that you push up to your fork. Eventually, when you're ready to submit a PR, you'll do so from the branch that you've been working on. When you push to your `origin/branch_name`, you should get prompted in the terminal by `git` with a URL you can follow to submit a PR. To do so: 70 | 71 | 1. Make sure your `main` is up-to-date with upstream `git fetch upstream` and `git merge upstream/main` 72 | 1. Make sure your environment is up-to-date with upstream `make update_environment` 73 | 1. Start your work (from your up-to-date `main`) in a new branch: `git checkout -b my_new_branch` 74 | 1. Commit all your changes to `my_new_branch` (as per the [Easydata git Workflow](git-workflow.md)) 75 | 76 | 77 | You can pretty much blindly do this by following the [Easydata git Workflow](git-workflow.md) religiously. 78 | 79 | #### How to submit a PR 80 | 81 | 1. Push to your github.com fork by `git push origin my_new_branch`. 82 | 1. If this is the first time you do this from `my_new_branch`, you'll be prompted with a URL from your terminal for how to create a PR. Otherwise, if you go to github.com, you'll see a yellow banner at the top of the screen prompting you to submit a PR (as long as you're not out of sync with the `upstream main`, in which case, re-sync your branch). 83 | 1. You have the option to submit a PR in **Draft** status. Select this if you have a work in progress. It disables the ability to merge your PR. 84 | 1. Once you submit your PR, there may be a yellow dot or red X beside your PR. This is because we have tests set up in CircleCI. If you are working in a private repo, you need to authorize access to CircleCI on your fork for tests to run successfully. To do so, follow the link to CircleCI and **authorize github.com** on your fork of the repo. 85 | 1. When ready, take your PR out of **Draft** status. 86 | 87 | 88 | #### General Git Suggestions: 89 | * Never commit your changes to your `main` branch. 
Always work from a branch. Then you always have a clean local copy of `upstream/main` to work from. 90 | * Stick to **basic git commands** unless you *really* know what you're doing. (e.g. use `add`, `fetch`, `merge`, `commit`, `diff`, `rm`, `mv`) 91 | * While sometimes convenient, avoid using `git pull` from remotes. Or just generally avoid using `git pull`. Use `git fetch` then `git merge` instead. 92 | * Use `git add -p` instead of `git add` to break up your commits into logical pieces rather than one big snotball of changes. 93 | 94 | ### Sharing Datasets 95 | 96 | Most of the infrastructure behind the scenes in this repo is set up for sharing datasets reliably and reproducibly without ever checking them in. We use **recipes** for making Datasets instead. So in short, don't check in data. And use the `Dataset.load()` API. 97 | 98 | In order to convert your data to a `Dataset` object, we will need to generate a catalog recipe that uses a custom function for processing your raw data. Doing so allows us to document all the munging, pre-processing, and data verification necessary to reproducibly build the dataset. Details on how to do this can be found on the [cookiecutter-easydata repo](https://github.com/hackalog/cookiecutter-easydata), but it's likely better to ask the maintainers of this project to point you in the right direction for how to get a Dataset added to this project. 99 | 100 | For more on `Dataset` objects, see [Getting and Using Datasets](datasets.md). 101 | 102 | For more on licenses, see [below](#quick-guide-to-licenses). 103 | 104 | ### Sharing conda environments 105 | In order to make sharing virtual environments easy, the repo includes `make` commands that you can use to manage your environment via an `environment.yml` file (and corresponding `environment.${ARCH}.lock.yml` file). By [setting up and maintaining your conda environment reproducibly](conda-environments.md), sharing your environment is as easy as including any changes to your `environment.yml` file in your PR. 106 | 107 | If there's any chance that you added something to the conda environment needed to run your code that was **not** added via your `environment.yml`, [delete your environment, recreate it](conda-environments.md#nuke-it-from-orbit) and then make the appropriate changes to your `environment.yml` file. 108 | 109 | Remember to `make update_environment` regularly after fetching and merging the `upstream` remote to keep your conda environment up-to-date with the shared (team) repo. 110 | 111 | ### Sharing notebooks and code 112 | We're keen on using notebooks for sharing stories and analyses. Best practices can be found in [using notebooks for sharing your analysis](notebooks.md).
A short list of reminders: 113 | 114 | * Follow the [notebook naming convention](notebooks.md#naming-convention) 115 | * Use the [`Dataset.load()` API](datasets.md) for accessing data 116 | * Put [code in the `src` module](notebooks.md#on-code) under `src/xyz` where `xyz` is your (the author's) initials (as in the notebook naming convention) 117 | * Run **Kernel->Restart & Run All** and optionally **Kernel->Restart & Clear Output** before saving and checking in your notebooks 118 | 119 | ## Quick Guide to Licenses 120 | Work in progress...Add some references 121 | 122 | ### Quick References 123 | 124 | * [README](../README.md) 125 | * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) 126 | * [Getting and Using Datasets](datasets.md) 127 | * [Using Notebooks for Analysis](notebooks.md) 128 | * [Sharing your Work](sharing-your-work.md) 129 | * [Troubleshooting Guide](troubleshooting.md) 130 | -------------------------------------------------------------------------------- /src/data/catalog.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | import shutil 5 | 6 | from collections.abc import MutableMapping 7 | from ..log import logger 8 | from ..utils import load_json, save_json 9 | from .. import paths 10 | 11 | 12 | __all__ = [ 13 | 'Catalog', 14 | ] 15 | 16 | 17 | class Catalog(MutableMapping): 18 | """A catalog is a serializable, disk-backed git-friendly dict-like object for storing a data catalog. 19 | 20 | * "serializable" means anything stored in the catalog must be serializable to/from JSON. 21 | * "disk-backed" means all changes are reflected immediately in the on-disk serialization. 22 | * "git-friendly" means this on-disk format can be easily maintained in a git repo (with minimal 23 | issues around merge conflicts), and 24 | * "dict-like" means programmatically, it acts like a Python `dict`. 25 | 26 | On disk, a Catalog is stored as a directory of JSON files, one file per object 27 | The stem of the filename (e.g. stem.json) is the key (name) of the catalog entry 28 | in the dictionary, so `catalog/key.json` is accessible via catalog['key']. 29 | """ 30 | 31 | def __init__(self, 32 | catalog_name, 33 | data=None, 34 | catalog_path=None, 35 | create=True, 36 | delete=False, 37 | extension="json", 38 | ignore_errors=False, 39 | merge_priority="data", 40 | ): 41 | """ 42 | catalog_name: str 43 | Name of the catalog. Also name of directory containing JSON catalog files. relative to `catalog_path` 44 | catalog_path: path. (default: paths['catalog_dir']) 45 | Location of catalog directory (i.e. data catalog is stored at `catalog_path/catalog_name`) 46 | create: Boolean 47 | if True, create the catalog if needed 48 | data: 49 | Dict-like object containing data to be merged into the catalog 50 | delete: boolean 51 | If catalog exists on disk, delete it before continuing 52 | extension: string 53 | file extension to use for serialized JSON files. 54 | ignore_errors: Boolean 55 | If True, errors in delete/create will be ignored 56 | merge_priority: {"disk", "data"} 57 | If using `data` with an existing repo, this indicates how to merge the two 58 | If disk, values already stored in the catalog will be retained 59 | If data, contents of `data` will override existing items on disk. 
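        Examples
        --------
        An illustrative sketch (not a doctest). It assumes `paths['catalog_path']`
        points at a writable directory; the catalog and entry names below are
        placeholders::

            cat = Catalog("datasets")                    # opens or creates catalog/datasets/
            cat["my_dataset"] = {"hash": "sha1:abc123"}  # immediately written to my_dataset.json
            assert "my_dataset" in Catalog("datasets")   # entries persist across instances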
60 | 61 | """ 62 | if catalog_path is None: 63 | self.catalog_path = paths['catalog_path'] 64 | else: 65 | self.catalog_path = pathlib.Path(catalog_path) 66 | 67 | self.name = catalog_name 68 | self.extension = extension 69 | 70 | if data is None: 71 | data = {} 72 | 73 | if self.catalog_dir_fq.exists(): # Catalog exists on disk 74 | if delete: 75 | logger.debug(f"Deleting existing catalog dir: {self.name}") 76 | shutil.rmtree(self.catalog_dir_fq, ignore_errors=ignore_errors) 77 | 78 | # Load existing data (if it exists) 79 | self.data = {} 80 | disk_data = self._load(return_dict=True) 81 | logger.debug(f"Loaded {len(disk_data)} records from '{self.name}' Catalog.") 82 | 83 | if create: 84 | if not self.catalog_dir_fq.exists(): # Catalog exists on disk 85 | logger.debug(f"Creating new catalog:{self.name}") 86 | os.makedirs(self.catalog_dir_fq, exist_ok=ignore_errors) 87 | 88 | if data: 89 | logger.debug(f"Merging {len(disk_data)} on-disk and {len(data)} off-disk parameters") 90 | if merge_priority == "disk": 91 | self.data = {**data, **disk_data} 92 | elif merge_priority == "data": 93 | self.data = {**disk_data, **data} 94 | else: 95 | raise ValueError(f"Unknown merge_priority:{merge_priority}") 96 | else: 97 | self.__setitem__ = self._memory_setitem 98 | self.data = disk_data 99 | self.__setitem__ = self._disk_setitem 100 | 101 | self._verify_save() 102 | 103 | @property 104 | def file_glob(self): 105 | """glob string that will match all key files in this catalog directory. 106 | """ 107 | return f"*.{self.extension}" 108 | 109 | @property 110 | def catalog_dir_fq(self): 111 | """pathlib.Path returning fully qualified path to catalog directory. 112 | """ 113 | return self.catalog_path / self.name 114 | 115 | def __getitem__(self, key): 116 | return self.data[key] 117 | 118 | def _disk_setitem(self, key, value): 119 | self.data[key] = value 120 | self._save_item(key) 121 | 122 | def _memory_setitem(self, key, value): 123 | self.data[key] = value 124 | 125 | # So we can swap between behaviors 126 | __setitem__ = _disk_setitem 127 | 128 | def __delitem__(self, key): 129 | del self.data[key] 130 | self._del_item(key) 131 | 132 | def __iter__(self): 133 | return iter(self.data) 134 | 135 | def __len__(self): 136 | return len(self.data) 137 | 138 | def __repr__(self): 139 | return f"" 140 | 141 | def __eq__(self, other): 142 | """Two catalogs are equal if they have the same contents, 143 | regardless of where or how they are stored on-disk. 144 | """ 145 | return self.data == other.data 146 | 147 | def _load(self, return_dict=False): 148 | """reload an entire catalog from its on-disk serialization. 149 | 150 | if return_dict is True, return the data that would have been loaded, 151 | but do not change the contents of the catalog. 
152 | 153 | """ 154 | catalog_dict = {} 155 | for catalog_file in self.catalog_dir_fq.glob(self.file_glob): 156 | catalog_dict[catalog_file.stem] = load_json(catalog_file) 157 | 158 | if return_dict is True: 159 | return catalog_dict 160 | self.__setitem__ = self._memory_setitem 161 | self.data = catalog_dict 162 | self.__setitem__ = self._disk_setitem 163 | 164 | def _del_item(self, key): 165 | """Delete the on-disk serialization of a catalog entry""" 166 | filename = self.catalog_dir_fq / f"{key}.{self.extension}" 167 | logger.debug(f"Deleting catalog entry: '{key}.{self.extension}'") 168 | filename.unlink() 169 | 170 | def _save_item(self, key): 171 | """serialize a catalog entry to disk""" 172 | value = self.data[key] 173 | logger.debug(f"Writing entry:'{key}' to catalog:'{self.name}'.") 174 | save_json(self.catalog_dir_fq / f"{key}.{self.extension}", value) 175 | 176 | def _save(self, paranoid=True): 177 | """Save all catalog entries to disk 178 | 179 | if paranoid=True, verify serialization is equal to in-memory copy 180 | """ 181 | logger.debug(f"Saving {len(self.data)} records to catalog '{self.name}'") 182 | for key in self.data: 183 | self._save_item(key) 184 | if paranoid: 185 | _verify_save() 186 | 187 | def _verify_save(self): 188 | logger.debug(f"Verifying serialization for catalog '{self.name}'") 189 | new = self._load(return_dict=True) 190 | if new != self.data: 191 | logger.error("Serialization failed. On-disk catalog differs from in-memory catalog") 192 | 193 | @classmethod 194 | def load(cls, name, create=True, ignore_errors=True, catalog_path=None): 195 | """Load a Catalog from disk. 196 | 197 | Parameters 198 | ---------- 199 | name: String 200 | catalog name. Also the directory name for the serialized data 201 | create: Boolean 202 | If the catalog doesn't exist, create it. 203 | catalog_path: 204 | Path to where catalog will be created. Default: paths['catalog_path'] 205 | ignore_errors: Boolean 206 | if False, and create=True, an error is thrown if the catalog already exists. 207 | """ 208 | 209 | if catalog_path is None: 210 | catalog_path = paths['catalog_path'] 211 | else: 212 | catalog_path = pathlib.Path(catalog_path) 213 | 214 | catalog_dir_fq = catalog_path / name 215 | if not catalog_dir_fq.exists() and not create: 216 | raise FileNotFoundError(f"Catalog:{name} not found and create=False") 217 | 218 | catalog = cls(name, create=create, ignore_errors=ignore_errors, catalog_path=catalog_path, 219 | delete=False, data=None) 220 | return catalog 221 | 222 | @classmethod 223 | def create(cls, name, data=None, replace=False): 224 | """Create (or replace) a Catalog. 225 | 226 | Parameters 227 | ---------- 228 | name: String 229 | catalog name. Also the directory name for the serialized data 230 | data: dict (or dict-like object) 231 | Initial contents of Catalog object 232 | replace: Boolean 233 | If True, replace an existing catalog. 234 | If False, an error is thrown if the catalog exists. 235 | """ 236 | 237 | catalog = cls(name, create=True, delete=replace, data=data) 238 | return catalog 239 | 240 | 241 | @staticmethod 242 | def delete(name, ignore_errors=False, catalog_path=None): 243 | """Delete the on-disk Catalog 244 | 245 | Parameters 246 | ---------- 247 | name: String 248 | Catalog name. Also the name of the directory to be deleted 249 | ignore_errors: 250 | If False, throw an error if catalog does not exist 251 | catalog_path: 252 | Directory containing catalog. 
Default paths['catalog_path'] 253 | """ 254 | if catalog_path is None: 255 | catalog_path = paths['catalog_path'] 256 | else: 257 | catalog_path = pathlib.Path(catalog_path) 258 | 259 | logger.debug(f"Deleting existing catalog dir: {name}") 260 | shutil.rmtree(catalog_path / name, ignore_errors=ignore_errors) 261 | 262 | @classmethod 263 | def from_old_catalog(cls, catalog_file_fq, catalog_name=None, replace=False, catalog_path=None): 264 | """Create a catalog from an old combined-format JSON file 265 | 266 | Converts an old-format (combined) JSON catalog file to a new format (directory 267 | of JSON files) catalog file. 268 | 269 | Parameters 270 | ---------- 271 | catalog_file_fq: String or Path 272 | fully qualified (or valid relative) path to old-format JSON catalog file 273 | catalog_name: None or String or Path 274 | if None, new-format catalog directory will be the stem (extensionless part) 275 | of `catalog_file_fq` 276 | replace: Boolean 277 | If True, an existing catalog file will be overwritten 278 | 279 | Other parameters are the same as per `Catalog.__init__()` 280 | """ 281 | if catalog_path is None: 282 | catalog_path = paths['catalog_path'] 283 | else: 284 | catalog_path = pathlib.Path(catalog_path) 285 | 286 | catalog_file_fq = pathlib.Path(catalog_file_fq) 287 | 288 | if catalog_file_fq.exists(): 289 | catalog_dict = load_json(catalog_file_fq) 290 | else: 291 | logger.warning(f"Old catalog file:'{catalog_file_fq}' does not exist.") 292 | catalog_dict = {} 293 | 294 | if catalog_name is None: 295 | catalog_name = pathlib.Path(catalog_file_fq).stem 296 | 297 | catalog_dir_fq = catalog_path / catalog_name 298 | if catalog_dir_fq.exists() and not replace: 299 | raise FileExistsError(f"Catalog:{catalog_name} exists but replace=False") 300 | 301 | catalog = cls(catalog_name, 302 | data=catalog_dict, 303 | create=True, delete=replace, 304 | catalog_path=catalog_path) 305 | return catalog 306 | -------------------------------------------------------------------------------- /src/data/fetch.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import hashlib 3 | import joblib 4 | import os 5 | import pathlib 6 | import requests 7 | import shutil 8 | import tarfile 9 | import tempfile 10 | import zipfile 11 | import zlib 12 | import requests 13 | import joblib 14 | import gdown 15 | 16 | from tqdm.auto import tqdm 17 | 18 | from .. import paths 19 | from ..log import logger 20 | 21 | __all__ = [ 22 | 'available_hashes', 23 | 'fetch_file', 24 | 'fetch_files', 25 | 'fetch_text_file', 26 | 'get_dataset_filename', 27 | 'hash_file', 28 | 'hash_object', 29 | 'infer_filename', 30 | 'unpack', 31 | ] 32 | 33 | _HASH_FUNCTION_MAP = { 34 | 'md5': hashlib.md5, 35 | 'sha1': hashlib.sha1, 36 | 'size': os.path.getsize, 37 | } 38 | 39 | def safe_symlink(target, link_name, overwrite=False): 40 | ''' 41 | Create a symbolic link named link_name pointing to target. 42 | If link_name exists then FileExistsError is raised, unless overwrite=True. 43 | When trying to overwrite a directory, IsADirectoryError is raised. 
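    For example (illustrative paths)::

        safe_symlink("raw/big_file.dat", "latest.dat", overwrite=True)

    When overwriting, the link is first created under a temporary name and then
    swapped into place with os.replace(), so an existing link is never left
    half-replaced.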
44 | ''' 45 | 46 | if not overwrite: 47 | os.symlink(target, link_name) 48 | return 49 | 50 | # os.replace() may fail if files are on different filesystems 51 | link_dir = os.path.dirname(link_name) 52 | 53 | # Create link to target with temporary filename 54 | while True: 55 | temp_link_name = tempfile.mktemp(dir=link_dir) 56 | 57 | # os.* functions mimic as closely as possible system functions 58 | # The POSIX symlink() returns EEXIST if link_name already exists 59 | # https://pubs.opengroup.org/onlinepubs/9699919799/functions/symlink.html 60 | try: 61 | os.symlink(target, temp_link_name) 62 | break 63 | except FileExistsError: 64 | pass 65 | 66 | # Replace link_name with temp_link_name 67 | try: 68 | # Pre-empt os.replace on a directory with a nicer message 69 | if not os.path.islink(link_name) and os.path.isdir(link_name): 70 | raise IsADirectoryError(f"Cannot symlink over existing directory: '{link_name}'") 71 | os.replace(temp_link_name, link_name) 72 | except: 73 | if os.path.islink(temp_link_name): 74 | os.remove(temp_link_name) 75 | raise 76 | 77 | 78 | def available_hashes(): 79 | """Valid Hash Functions 80 | 81 | This function simply returns the dict known hash function 82 | algorithms. 83 | 84 | It exists to allow for a description of the mapping for 85 | each of the valid strings. 86 | 87 | The hash functions are: 88 | 89 | ============ ==================================== 90 | Algorithm Function 91 | ============ ==================================== 92 | md5 hashlib.md5 93 | sha1 hashlib.sha1 94 | size os.path.getsize 95 | ============ ==================================== 96 | 97 | >>> list(available_hashes().keys()) 98 | ['md5', 'sha1', 'size'] 99 | """ 100 | return _HASH_FUNCTION_MAP 101 | 102 | def hash_object(obj, hash_type="sha1"): 103 | '''compute the hash of a python object 104 | 105 | Parameters 106 | ---------- 107 | hash_type: {'md5', 'sha1', 'size'} 108 | hash function to use. 109 | Must be in `available_hashes` 110 | 111 | Returns 112 | ------- 113 | A string: f"{hash_type}:{hash_value}" 114 | ''' 115 | data_hash = joblib.hash(obj, hash_name=hash_type).hexdigest() 116 | return f"{hash_type}:{data_hash}" 117 | 118 | def hash_file(fname, algorithm="sha1", block_size=4096): 119 | '''Compute the hash of an on-disk file 120 | 121 | hash_type: {'md5', 'sha1', 'size'} 122 | hash function to use. 123 | Must be in `available_hashes` 124 | block_size: 125 | size of chunks to read when hashing 126 | 127 | Returns 128 | ------- 129 | String: f"{hash_type}:{hash_value}" 130 | ''' 131 | if algorithm == 'size': 132 | hashval = _HASH_FUNCTION_MAP[algorithm] 133 | return f"{algorithm}:{hashval(fname)}" 134 | 135 | hashval = _HASH_FUNCTION_MAP[algorithm]() 136 | with open(fname, "rb") as fd: 137 | for chunk in iter(lambda: fd.read(block_size), b""): 138 | hashval.update(chunk) 139 | return f"{algorithm}:{hashval.hexdigest()}" 140 | 141 | def tqdm_download(url, url_options=None, filename=None, 142 | download_path=None,chunk_size=1024): 143 | """Download a URL via requests, displaying a tqdm status bar 144 | 145 | Parameters 146 | ---------- 147 | url: 148 | URL to download 149 | url_options: 150 | Options passed to requests.request() for download 151 | filename: 152 | filename to save. 
If omitted, it's inferred from the URL 153 | download_path: path, default paths['raw_data_path'] 154 | Inferred filename is relative to this path 155 | chunk_size: 156 | block size for writes 157 | 158 | Raises 159 | ------ 160 | HTTPError if download fails 161 | 162 | Returns 163 | ------- 164 | filename of written file 165 | """ 166 | if url_options is None: 167 | url_options = {} 168 | if download_path is None: 169 | download_path = paths['raw_data_path'] 170 | else: 171 | download_path = pathlib.Path(download_path) 172 | if filename is None: 173 | fn = url.split("/")[-1] 174 | logger.debug(f"filename not specified. Inferring '{fn}' from url") 175 | filename = download_path / fn 176 | else: 177 | filename = pathlib.Path(filename) 178 | resp = requests.get(url, stream=True, **url_options) 179 | total = int(resp.headers.get('content-length', 0)) 180 | with open(filename, 'wb') as file, tqdm( 181 | desc=filename.name, 182 | total=total, 183 | unit='iB', 184 | unit_scale=True, 185 | unit_divisor=1024, 186 | ) as bar: 187 | for data in resp.iter_content(chunk_size=chunk_size): 188 | size = file.write(data) 189 | bar.update(size) 190 | resp.raise_for_status() 191 | 192 | return filename 193 | 194 | def fetch_files(force=False, dst_dir=None, **kwargs): 195 | ''' 196 | fetches a list of files via URL 197 | 198 | url_list: list of dicts, each containing: 199 | url: 200 | url to be downloaded 201 | hash_type: 202 | Type of hash to compute 203 | hash_value: (optional) 204 | if specified, the hash of the downloaded file will be 205 | checked against this value 206 | name: (optional) 207 | Name of this dataset component 208 | fetch_action: {'copy', 'message', 'url'} 209 | Method used to obtain file 210 | raw_file: 211 | output file name. If not specified, use the last 212 | component of the URL 213 | 214 | Examples 215 | -------- 216 | >>> fetch_files() 217 | Traceback (most recent call last): 218 | ... 219 | Exception: One of `file_name`, `url`, or `source_file` is required 220 | ''' 221 | url_list = kwargs.get('url_list', None) 222 | if not url_list: 223 | return fetch_file(force=force, dst_dir=dst_dir, **kwargs) 224 | result_list = [] 225 | for url_dict in url_list: 226 | name = url_dict.get('name', None) 227 | if name is None: 228 | name = url_dict.get('url', 'dataset') 229 | logger.debug(f"Ready to fetch {name}") 230 | result_list.append(fetch_file(force=force, dst_dir=dst_dir, **url_dict)) 231 | return all([r[0] for r in result_list]), result_list 232 | 233 | def fetch_text_file(url, file_name=None, dst_dir=None, force=True, **kwargs): 234 | """Fetch a text file (via URL) and return it as a string. 235 | 236 | Arguments 237 | --------- 238 | 239 | file_name: 240 | output file name. If not specified, use the last 241 | component of the URL 242 | dst_dir: 243 | directory to place downloaded files 244 | force: boolean 245 | normally, the URL is only downloaded if `file_name` is 246 | not present on the filesystem, or if the existing file has a 247 | bad hash. If force is True, download is always attempted. 
248 | 249 | In addition to these options, any of `fetch_file`'s keywords may 250 | also be passed 251 | 252 | Returns 253 | ------- 254 | fetched string, or None if something went wrong with the download 255 | """ 256 | retlist = fetch_file(url, file_name=file_name, dst_dir=dst_dir, 257 | force=force, **kwargs) 258 | if retlist[0]: 259 | _, filename, _ = retlist 260 | with open(filename, 'r') as txt: 261 | return txt.read() 262 | else: 263 | logger.warning(f'fetch of {url} failed with status: {retlist[0]}') 264 | return None 265 | 266 | def infer_filename(url=None, file_name=None, source_file=None, **kwargs): 267 | """Infer a filename for a file-to-be-fetched. 268 | 269 | Parameters 270 | ---------- 271 | file_name: string 272 | if given, this is returned as the inferred filename (as a string, in case 273 | if is in pathlib.Path format) 274 | url: string 275 | if supplied (and no file_name is specified), the last component of the URL is 276 | returned as the inferred filename 277 | source_file: string 278 | If neither file_name nor url are specified, the last component of the source file 279 | is returned as the inferred filename. 280 | """ 281 | if file_name is not None: 282 | return str(file_name) 283 | elif url is not None: 284 | file_name = url.split("/")[-1] 285 | logger.debug(f"`file_name` not specified. Inferring from URL: {file_name}") 286 | elif source_file is not None: 287 | file_name = str(pathlib.Path(source_file).name) 288 | logger.debug(f"`file_name` not specified. Inferring from `source_file`: {file_name}") 289 | else: 290 | raise Exception('One of `file_name`, `url`, or `source_file` is required') 291 | return file_name 292 | 293 | 294 | def fetch_file(url=None, url_options=None, contents=None, 295 | file_name=None, dst_dir=None, 296 | force=False, source_file=None, 297 | hash_type=None, hash_value=None, 298 | fetch_action=None, message=None, 299 | **kwargs): 300 | '''Fetch the raw files needed by a DataSource. 301 | 302 | A DataSource is usually constructed from one or more raw files. 303 | This function handles the process of obtaining the raw files. 304 | 305 | Raw files are always specified relative to paths['raw_data_path'] 306 | 307 | If `file_name` does not exist, this will attempt to fetch or create 308 | the file based on the contents of `fetch_action`: 309 | * message: 310 | Display `message` to the user and fail. Used when manual intervention 311 | is required, such as when a licence agreement must be completed. 312 | * copy: 313 | Copies the file from somewhere in the filesystem (`source_file`). 314 | WARNING: This approach rarely leads to a reproducible data workflow 315 | * url: 316 | Fetches the source file from `url` 317 | * create: 318 | File will be created from the contents of `contents` 319 | 320 | If `file_name` already exists, compute the hash of the on-disk file 321 | and check 322 | 323 | contents: 324 | contents of file to be created (if fetch_action == 'create') 325 | url: 326 | url to be downloaded 327 | hash_type: {'md5', 'sha1'} 328 | Type of hash to compute. Should not be used with hash_value, as it is already specified there. 329 | hash_value: String (optional) 330 | "{hash_type}:{hash_hexvalue}" where "hash_type" in {'md5', 'sha1'} 331 | and hash_hexvalue is a hex-encoded string representing the hash value. 332 | if specified, the hash of the downloaded file will be 333 | checked against this value. 
334 | name: (optional) 335 | Name of this dataset component 336 | message: string 337 | Text to be displayed to user (if fetch_action == 'message') 338 | fetch_action: {'copy', 'message', 'url', 'create'} 339 | Method used to obtain file 340 | url_options: dict 341 | kwargs to pass when fetching URLs using requests 342 | file_name: 343 | output file name. If not specified, use the last 344 | component of the URL 345 | dst_dir: 346 | Can be used to override the default raw file location 347 | (paths['raw_data_path']) 348 | force: boolean 349 | normally, the URL is only downloaded if `file_name` is 350 | not present on the filesystem, or if the existing file has a 351 | bad hash. If force is True, download is always attempted. 352 | source_file: path 353 | Path to source file. (if fetch_action == 'copy') 354 | Will be copied to `paths['raw_data_path']` 355 | 356 | Returns 357 | ------- 358 | one of: 359 | (HTTP_Code, downloaded_filename, hash) (if downloaded from URL) 360 | (True, filename, hash) (if already exists) 361 | (False, [error], None) 362 | (False, `message`, None) (if fetch_action == 'message') 363 | 364 | Examples 365 | -------- 366 | >>> fetch_file() 367 | Traceback (most recent call last): 368 | ... 369 | Exception: One of `file_name`, `url`, or `source_file` is required 370 | ''' 371 | _valid_fetch_actions = ('message', 'copy', 'url', 'create', 'google-drive') 372 | 373 | if url_options is None: 374 | url_options = {} 375 | # infer filename from url or src_path if needed 376 | if file_name is None: 377 | file_name = infer_filename(url=url, source_file=source_file) 378 | logger.debug(f"Inferred filename:{file_name} from url:{url}, source_file:{source_file}") 379 | if dst_dir is None: 380 | dst_dir = paths['raw_data_path'] 381 | else: 382 | dst_dir = pathlib.Path(dst_dir) 383 | 384 | if not dst_dir.exists(): 385 | os.makedirs(dst_dir) 386 | 387 | raw_data_file = dst_dir / file_name 388 | 389 | if fetch_action not in _valid_fetch_actions: 390 | # infer fetch action (for backwards compatibility) 391 | if contents is not None: 392 | fetch_action = 'create' 393 | elif message is not None: 394 | fetch_action = 'message' 395 | elif url is not None: 396 | fetch_action = 'url' 397 | elif source_file is not None: 398 | fetch_action = 'copy' 399 | logger.debug(f"No `fetch_action` specified. Inferring type: {fetch_action}") 400 | 401 | if hash_type is None: 402 | if hash_value is None: 403 | hash_type = 'sha1' 404 | else: 405 | hash_type, _ = hash_value.split(":") 406 | else: # hash_type is not None 407 | if hash_value: 408 | old_hash_type = hash_type 409 | hash_type, _ = hash_value.split(":") 410 | if hash_type != old_hash_type: 411 | logger.warning(f"Conflicting hash_type and hash_value. Using {hash_type}") 412 | 413 | # If the file is already present, check its hash. 414 | if raw_data_file.exists() and fetch_action != 'create': 415 | logger.debug(f"{file_name} already exists. Checking hash...") 416 | raw_file_hash = hash_file(raw_data_file, algorithm=hash_type) 417 | if hash_value is not None: 418 | if raw_file_hash == hash_value: 419 | if force is False: 420 | logger.debug(f"{file_name} hash is valid. Skipping download.") 421 | return True, raw_data_file, raw_file_hash 422 | else: # raw_file_hash != hash_value 423 | logger.warning(f"{file_name} exists but has bad hash {raw_file_hash} != {hash_value}." 424 | " Re-fetching.") 425 | else: # hash_value is None 426 | if force is False: 427 | logger.debug(f"{file_name} exists, but no hash to check. 
" 428 | f"Setting to {raw_file_hash}") 429 | return True, raw_data_file, raw_file_hash 430 | 431 | if url is None and contents is None and source_file is None and message is None: 432 | raise Exception(f"Cannot proceed: {file_name} not found on disk, and no fetch information " 433 | "(`url`, `source_file`, `contents` or `message`) specified.") 434 | 435 | if fetch_action == 'url': 436 | if url is None: 437 | raise Exception(f"fetch_action = {fetch_action} but `url` unspecified") 438 | # Download the file 439 | try: 440 | logger.debug(f"fetching {url}") 441 | filename = tqdm_download(url, url_options=url_options, filename=raw_data_file) 442 | raw_file_hash = hash_file(filename, algorithm=hash_type) 443 | results = requests.get(url, **url_options) 444 | if hash_value is not None: 445 | if raw_file_hash != hash_value: 446 | logger.error(f"Invalid hash on downloaded {file_name}" 447 | f" {raw_file_hash} != {hash_value}") 448 | return False, f"Bad Hash: {raw_file_hash}", None 449 | except requests.exceptions.HTTPError as err: 450 | return False, err, None 451 | elif fetch_action == 'google-drive': 452 | if url is None: 453 | raise Exception(f"fetch_action = {fetch_action} but file ID unspecified (expected through url field)") 454 | # Download the file 455 | try: 456 | url_google_drive = f"https://drive.google.com/uc?id={url}" 457 | logger.debug(f"Fetch file ID {url} off of Google Drive (full URL {url_google_drive})") 458 | gdown.download(url_google_drive, str(raw_data_file), quiet=False) 459 | except Exception as err: 460 | return False, err, None 461 | raw_file_hash = hash_file(raw_data_file, algorithm=hash_type) 462 | return True, raw_data_file, raw_file_hash 463 | elif fetch_action == 'create': 464 | if contents is None: 465 | raise Exception(f"fetch_action == 'create' but `contents` unspecified") 466 | if hash_value is not None: 467 | logger.debug(f"Hash value ({hash_value}) ignored for fetch_action=='create'") 468 | with open(raw_data_file, 'w') as fw: 469 | fw.write(contents) 470 | logger.debug(f"Generating {file_name} hash...") 471 | raw_file_hash = hash_file(raw_data_file, algorithm=hash_type) 472 | return True, raw_data_file, raw_file_hash 473 | elif fetch_action == 'copy': 474 | if source_file is None: 475 | raise Exception("fetch_action == 'copy' but `copy` unspecified") 476 | logger.warning(f"Hardcoded paths for fetch_action == 'copy' may not be reproducible. 
Consider using fetch_action='message' instead") 477 | shutil.copyfile(source_file, raw_data_file) 478 | logger.debug(f"Checking hash of {file_name}...") 479 | raw_file_hash = hash_file(raw_data_file, algorithm=hash_type) 480 | source_file = pathlib.Path(source_file) 481 | logger.debug(f"Copying {source_file.name} to raw_data_path") 482 | return True, raw_data_file, raw_file_hash 483 | elif fetch_action == 'message': 484 | if message is None: 485 | raise Exception("fetch_action == 'copy' but `copy` unspecified") 486 | print(message) 487 | return False, message, None 488 | else: 489 | raise Exception("No valid fetch_action found: (fetch_action=='{fetch_action}')") 490 | 491 | logger.debug(f'Retrieved {raw_data_file.name} ({hash_type}:{raw_file_hash})') 492 | return results.status_code, raw_data_file, raw_file_hash 493 | 494 | def unpack(filename, dst_dir=None, src_dir=None, create_dst=True, unpack_action=None): 495 | '''Unpack a compressed file 496 | 497 | filename: path 498 | file to unpack 499 | dst_dir: path (default paths['interim_data_path']) 500 | destination directory for the unpack 501 | src_dir: path (default paths['raw_data_path']) 502 | destination directory for the unpack 503 | create_dst: boolean 504 | create the destination directory if needed 505 | unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None 506 | action to take in order to unpack this file. If None, it is inferred. 507 | ''' 508 | if dst_dir is None: 509 | dst_dir = paths['interim_data_path'] 510 | if src_dir is None: 511 | src_dir = paths['raw_data_path'] 512 | 513 | if create_dst: 514 | if not os.path.exists(dst_dir): 515 | os.makedirs(dst_dir) 516 | 517 | # in case it is a Path 518 | filename = pathlib.Path(filename) 519 | path = str((src_dir / filename).resolve()) 520 | 521 | if unpack_action is None: 522 | # infer unpack action 523 | if path.endswith('.zip'): 524 | unpack_action = 'zip' 525 | elif path.endswith('.tar.gz') or path.endswith('.tgz'): 526 | unpack_action = 'tgz' 527 | elif path.endswith('.tar.bz2') or path.endswith('.tbz'): 528 | unpack_action = 'tbz2' 529 | elif path.endswith('.tar'): 530 | unpack_action = 'tar' 531 | elif path.endswith('.gz'): 532 | unpack_action = 'gz' 533 | elif path.endswith('.Z'): 534 | unpack_action = 'compress' 535 | else: 536 | logger.warning(f"Can't infer `unpack_action` from filename {filename.name}. 
Defaulting to 'copy'.") 537 | unpack_action = 'copy' 538 | 539 | archive = False 540 | verb = "Copying" 541 | if unpack_action == 'none': 542 | logger.debug(f"Skipping unpack for {filename.name}") 543 | return 544 | elif unpack_action == 'symlink': 545 | logger.debug(f"Linking {filename.name}...") 546 | safe_symlink(pathlib.Path(dst_dir) / path, path, overwrite=True) 547 | return 548 | elif unpack_action == 'copy': 549 | opener, mode = open, 'rb' 550 | outfile, outmode = path, 'wb' 551 | elif unpack_action == 'zip': 552 | archive = True 553 | verb = "Unzipping" 554 | opener, mode = zipfile.ZipFile, 'r' 555 | elif unpack_action == 'tgz': 556 | archive = True 557 | verb = "Untarring and ungzipping" 558 | opener, mode = tarfile.open, 'r:gz' 559 | elif unpack_action == 'tbz2': 560 | archive = True 561 | verb = "Untarring and unbzipping" 562 | opener, mode = tarfile.open, 'r:bz2' 563 | elif unpack_action == 'tar': 564 | archive = True 565 | verb = "Untarring" 566 | opener, mode = tarfile.open, 'r' 567 | elif unpack_action == 'gz': 568 | verb = "Ungzipping" 569 | opener, mode = gzip.open, 'rb' 570 | outfile, outmode = path[:-3], 'wb' 571 | elif unpack_action == 'compress': 572 | verb = "Uncompressing" 573 | logger.warning(".Z files are only supported on systems that ship with gzip. Trying...") 574 | os.system(f'gzip -f -d {path}') 575 | opener, mode = open, 'rb' 576 | path = path[:-2] 577 | outfile, outmode = path, 'wb' 578 | else: 579 | raise Exception(f"Unknown unpack_action: {unpack_action}") 580 | 581 | with opener(path, mode) as f_in: 582 | if archive: 583 | logger.debug(f"Extracting {filename.name}...") 584 | f_in.extractall(path=dst_dir) 585 | else: 586 | outfile = pathlib.Path(outfile).name 587 | logger.debug(f"{verb} {outfile}...") 588 | with open(pathlib.Path(dst_dir) / outfile, outmode) as f_out: 589 | shutil.copyfileobj(f_in, f_out) 590 | 591 | def get_dataset_filename(ds_dict): 592 | """Figure out the downloaded filename for a dataset entry 593 | 594 | if a `file_name` key is present, use this, 595 | otherwise, use the last component of the `url` 596 | 597 | Returns the filename 598 | 599 | Examples 600 | -------- 601 | >>> ds_dict = {'url': 'http://example.com/path/to/file.txt'} 602 | >>> get_dataset_filename(ds_dict) 603 | 'file.txt' 604 | >>> ds_dict['file_name'] = 'new_filename.blob' 605 | >>> get_dataset_filename(ds_dict) 606 | 'new_filename.blob' 607 | """ 608 | 609 | file_name = ds_dict.get('file_name', None) 610 | url = ds_dict.get('url', []) 611 | if file_name is None: 612 | file_name = url.split("/")[-1] 613 | return file_name 614 | --------------------------------------------------------------------------------
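A typical fetch-then-unpack flow using the functions above looks like the following sketch. The URL and filename are placeholders, and it assumes this module is imported as `src.data.fetch` inside an Easydata project:

```python
# Illustrative sketch only: download a raw file, record its hash, and
# unpack it into the interim data directory.
from src.data.fetch import fetch_file, unpack

status, raw_path, file_hash = fetch_file(
    url="https://example.com/data/sample.tar.gz",  # hypothetical source URL
    hash_type="sha1",
)
if status:                   # True, or an HTTP status code after a fresh download
    print(f"Fetched {raw_path} ({file_hash})")
    unpack(raw_path.name)    # extracted under paths['interim_data_path']
```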