├── .gitattributes ├── {{ cookiecutter.repo_name }} ├── catalog │ ├── .gitignore │ └── config.ini ├── data │ ├── raw │ │ └── .gitkeep │ ├── external │ │ └── .gitkeep │ ├── interim │ │ └── .gitkeep │ └── processed │ │ └── .gitkeep ├── notebooks │ └── .gitkeep ├── reference │ ├── .gitkeep │ ├── datasets │ │ └── .gitkeep │ ├── templates │ │ └── .gitkeep │ └── easydata │ │ ├── images │ │ └── toolbar-screenshot.png │ │ ├── git-workflow.md │ │ ├── git-configuration.md │ │ ├── troubleshooting.md │ │ ├── paths.md │ │ ├── datasets.md │ │ ├── notebooks.md │ │ └── conda-environments.md ├── reports │ ├── .gitkeep │ └── figures │ │ └── .gitkeep ├── models │ ├── figures │ │ └── .gitkeep │ ├── output │ │ └── .gitkeep │ └── trained │ │ └── .gitkeep ├── {{ cookiecutter.module_name }} │ ├── data │ │ ├── .gitkeep │ │ ├── __init__.py │ │ ├── process_functions.py │ │ ├── fileset.py │ │ └── utils.py │ ├── tests │ │ ├── __init__.py │ │ ├── no_ci │ │ │ ├── __init__.py │ │ │ └── test_user_dataset_environment_integration.py │ │ ├── test_imports.py │ │ ├── test_ci.py │ │ ├── test_catalog.py │ │ ├── test_iter_directory.py │ │ ├── make_test_datasets.py │ │ └── data │ │ │ └── dataset-test.json │ ├── analysis │ │ └── __init__.py │ ├── log │ │ └── __init__.py │ ├── exceptions.py │ ├── decorators.py │ ├── __init__.py │ ├── conftest.py │ ├── workflow.py │ ├── _paths.py │ ├── utils │ │ ├── ipynbname.py │ │ └── __init__.py │ └── kvstore.py ├── .easydata.json ├── docs │ ├── commands.rst │ ├── getting-started.rst │ ├── index.rst │ ├── make.bat │ ├── Makefile │ └── conf.py ├── scripts │ ├── bootstrap.yml │ ├── tests │ │ ├── add-dependency.py │ │ └── add-extra-channel-dependency.py │ ├── clean.py │ ├── am_i_ready.py │ ├── help-update.py │ ├── debug.py │ ├── help.py │ └── split_pip.py ├── setup.py ├── .env ├── Makefile.win32 ├── .post-create-environment.txt ├── Makefile.include ├── environment.yml ├── .gitignore ├── .circleci │ └── config.yml ├── LICENSE ├── Makefile ├── Makefile.envs └── README.md ├── docs ├── css │ └── extra.css ├── favicon.ico ├── README.md ├── test_docs.py ├── index.md ├── 00-xyz-sample-notebook.ipynb ├── datasets.md ├── New-Edge-Template.ipynb ├── opinions.md └── Add-derived-dataset.ipynb ├── requirements.txt ├── .gitignore ├── .cookiecutter-easydata-test-circleci.yml ├── .cookiecutter-easydata-test.yml ├── circleci-cookiecutter-easydata.json ├── cookiecutter.json ├── mkdocs.yml ├── LICENSE ├── hooks └── post_gen_project.py ├── tests └── test_creation.py ├── .circleci └── config.yml └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/catalog/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/data/raw/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/.gitkeep: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reports/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/data/external/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/data/interim/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/data/processed/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/models/figures/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/models/output/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/models/trained/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reports/figures/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/datasets/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/templates/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/css/extra.css: -------------------------------------------------------------------------------- 1 | h1, h2, h3 { 2 | margin-top: 77px; 3 | } 4 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/.easydata.json: -------------------------------------------------------------------------------- 1 | {{ cookiecutter | jsonify }} 2 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ 
cookiecutter.module_name }}/tests/no_ci/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/easydata/HEAD/docs/favicon.ico -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-jupyter 3 | cookiecutter 4 | pytest 5 | ruamel.yaml 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docs/site/ 2 | 3 | # OSX Junk 4 | .DS_Store 5 | 6 | # test cache 7 | .cache/* 8 | tests/__pycache__/* 9 | *~ 10 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/docs/commands.rst: -------------------------------------------------------------------------------- 1 | Commands 2 | ======== 3 | 4 | The Makefile contains the central entry points for common tasks related to this project. 5 | Type `make` for help 6 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/easydata/images/toolbar-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/easydata/HEAD/{{ cookiecutter.repo_name }}/reference/easydata/images/toolbar-screenshot.png -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .catalog import * 2 | from .datasets import * 3 | from .fetch import * 4 | from .utils import * 5 | from .fileset import * 6 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/docs/getting-started.rst: -------------------------------------------------------------------------------- 1 | Getting started 2 | =============== 3 | 4 | This is where you describe how to get set up on a clean install, including the 5 | commands necessary to get the raw data (using the `sync_data_from_s3` command, 6 | for example), and then how to make the cleaned, final data sets. 
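
A minimal sketch of what those instructions often look like, using the environment
targets from the project Makefile (a sketch only; adjust the target names to whatever
your Makefile actually provides)::

    make create_environment
    conda activate {{ cookiecutter.repo_name }}
    make update_environment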
7 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/log/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | _log_fmt = '%(asctime)s - %(module)s - %(levelname)s - %(message)s' 6 | logging.basicConfig(level=os.environ.get('LOGLEVEL', 'INFO'), format=_log_fmt) 7 | _MODULE = sys.modules[__name__] 8 | logger = logging.getLogger(__name__) 9 | -------------------------------------------------------------------------------- /.cookiecutter-easydata-test-circleci.yml: -------------------------------------------------------------------------------- 1 | default_context: 2 | author_name: acwooding 3 | conda_path: /home/circleci/miniconda/bin 4 | description: Template for testing cookiecutter-easydata 5 | module_name: src 6 | open_source_license: MIT 7 | project_name: test-env 8 | python_version: latest 9 | repo_name: test-env 10 | virtualenv: conda -------------------------------------------------------------------------------- /.cookiecutter-easydata-test.yml: -------------------------------------------------------------------------------- 1 | default_context: 2 | author_name: acwooding 3 | conda_path: /home/travis/miniconda3/conda 4 | description: Template for testing cookiecutter-easydata 5 | module_name: src 6 | open_source_license: MIT 7 | project_name: test-env 8 | python_version: latest 9 | repo_name: test-env 10 | virtualenv: conda 11 | -------------------------------------------------------------------------------- /circleci-cookiecutter-easydata.json: -------------------------------------------------------------------------------- 1 | {"cookiecutter": {"project_name": "test-env", "repo_name": "test-env", "default_branch": "master", "module_name": "src", "author_name": "acwooding", "description": "Template for testing cookiecutter-easydata", "open_source_license": "MIT", "python_version": "latest", "conda_path": "/home/circleci/miniconda/bin", "upstream_location": "github.com", "_template": "."}} -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/scripts/bootstrap.yml: -------------------------------------------------------------------------------- 1 | {% macro pyver() -%} 2 | {% if cookiecutter.python_version == 'latest' -%} 3 | - python 4 | {% else -%} 5 | - python={{ cookiecutter.python_version }} 6 | {% endif -%} 7 | {% endmacro -%} 8 | name: {{ cookiecutter.repo_name }} 9 | channels: 10 | - defaults 11 | dependencies: 12 | - pyyaml 13 | {{ pyver()|indent(3, true) }} 14 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='{{ cookiecutter.module_name }}', 5 | packages=find_packages(), 6 | version='2.0.0', 7 | description='''{{ cookiecutter.description }}''', 8 | author='{{ cookiecutter.author_name }}', 9 | license='{% if cookiecutter.open_source_license == 'MIT' %}MIT{% elif cookiecutter.open_source_license == 'BSD-2-Clause' %}BSD-2{% endif %}', 10 | ) 11 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/no_ci/test_user_dataset_environment_integration.py: 
-------------------------------------------------------------------------------- 1 | ## Test dataset information 2 | import unittest 3 | 4 | from {{ cookiecutter.module_name }}.data import Dataset 5 | 6 | 7 | class TestDatasets(unittest.TestCase): 8 | """ 9 | Basic smoke tests to ensure that all of the available datasets 10 | load and have some expected property. 11 | """ 12 | def basic_unit_test(self): 13 | assert True 14 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/catalog/config.ini: -------------------------------------------------------------------------------- 1 | [Paths] 2 | cache_path = ${data_path}/interim/cache 3 | data_path = ${project_path}/data 4 | figures_path = ${output_path}/figures 5 | interim_data_path = ${data_path}/interim 6 | notebook_path = ${project_path}/notebooks 7 | output_path = ${project_path}/reports 8 | processed_data_path = ${data_path}/processed 9 | project_path = ${catalog_path}/.. 10 | raw_data_path = ${data_path}/raw 11 | template_path = ${project_path}/reference/templates 12 | 13 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/scripts/tests/add-dependency.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import yaml 3 | 4 | 5 | if __name__ == "__main__": 6 | assert len(sys.argv[1:]) == 1 7 | dependency_new = sys.argv[1] 8 | 9 | with open("environment.yml", "rt", encoding="utf-8") as file_env: 10 | env = yaml.safe_load(file_env) 11 | env["dependencies"].append(dependency_new) 12 | with open("environment.yml", "wt", encoding="utf-8") as file_env: 13 | yaml.safe_dump(env, file_env) 14 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/.env: -------------------------------------------------------------------------------- 1 | # Environment variables go here, can be read by `python-dotenv` package: 2 | # 3 | # `{{ cookiecutter.module_name }}/script.py` 4 | # ---------------------------------------------------------------- 5 | # import dotenv 6 | # 7 | # project_dir = os.path.join(os.path.dirname(__file__), os.pardir) 8 | # dotenv_path = os.path.join(project_dir, '.env') 9 | # dotenv.load_dotenv(dotenv_path) 10 | # ---------------------------------------------------------------- 11 | # 12 | # DO NOT ADD THIS FILE TO VERSION CONTROL! 
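#
# What goes in here is up to you. For illustration only (these names are made up):
#
# DATA_BUCKET=s3://my-bucket/raw
# SOME_API_TOKEN=replace-me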
13 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import yaml 3 | 4 | 5 | if __name__ == "__main__": 6 | channel_order = ['defaults', 'pytorch'] 7 | dependency_new = "pytorch::cpuonly" 8 | 9 | with open("environment.yml", "rt", encoding="utf-8") as file_env: 10 | env = yaml.safe_load(file_env) 11 | env["dependencies"].append(dependency_new) 12 | env["channel-order"] = channel_order 13 | with open("environment.yml", "wt", encoding="utf-8") as file_env: 14 | yaml.safe_dump(env, file_env) 15 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | Generating the docs 2 | ---------- 3 | 4 | Install requirements: 5 | 6 | pip install -r requirements.txt 7 | 8 | Change directories into the docs folder: 9 | 10 | cd docs 11 | 12 | Use [mkdocs](http://www.mkdocs.org/) structure to update the documentation. Test locally with: 13 | 14 | mkdocs serve 15 | 16 | Once the docs look good, publish to `gh-pages` branch with: 17 | 18 | mkdocs gh-deploy --clean 19 | 20 | ** Note **: Never edit the generated site by hand because using `gh-deploy` blows away the `gh-pages` branch and you'll lose your edits. 21 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/Makefile.win32: -------------------------------------------------------------------------------- 1 | ECHO = set EMPTY=; echo %EMPTY% 2 | RM = del 3 | RMTREE = rmdir /s /q 4 | CAT = type 5 | SET = set 6 | WHICH = where 7 | DEVNULL = nul 8 | CMDSEP = & 9 | 10 | # Some UNIXish packages force the installation of a Bourne-compatible shell, and Make 11 | # prefers using this when it sees it. We thus force the usage of the good ole Batch 12 | # shell to avoid issues. Note that **exact path spelling** against the value of 13 | # environment variable COMSPEC is necessary, as GNU Make believes paths are case- 14 | # sensitive even on Windows. 15 | SHELL = C:\WINDOWS\system32\cmd.exe 16 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/docs/index.rst: -------------------------------------------------------------------------------- 1 | .. {{ cookiecutter.project_name }} documentation master file, created by 2 | sphinx-quickstart. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | {{ cookiecutter.project_name }} documentation! 7 | ============================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | getting-started 15 | commands 16 | 17 | 18 | 19 | Indices and tables 20 | ================== 21 | 22 | * :ref:`genindex` 23 | * :ref:`modindex` 24 | * :ref:`search` 25 | -------------------------------------------------------------------------------- /cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "project_name", 3 | "repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}", 4 | "default_branch": ["main", "master"], 5 | "module_name": "src", 6 | "author_name": "Your name (or the copyright holder)", 7 | "description": "A short description of this project.", 8 | "open_source_license": ["MIT", "BSD-2-Clause", "Proprietary"], 9 | "python_version": ["latest", "3.10", "3.9", "3.8", "3.7"], 10 | "conda_path": "~/anaconda3/bin/conda", 11 | "upstream_location": ["github.com", "gitlab.com", "bitbucket.org", "your-custom-repo"] 12 | } 13 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/scripts/clean.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import os.path 4 | import shutil 5 | 6 | 7 | for path, dirs, files in os.walk("."): 8 | try: 9 | i_pycache = dirs.index("__pycache__") 10 | shutil.rmtree(os.path.join(path, "__pycache__")) 11 | del dirs[i_pycache] 12 | except ValueError: 13 | pass 14 | 15 | for file_ in files: 16 | file = os.path.join(path, file_) 17 | ext = os.path.splitext(file)[1] 18 | if any(ext.endswith(x) for x in ["pyo", "pyc"]): 19 | os.unlink(file) 20 | 21 | for p in glob.glob(".make.*"): 22 | os.unlink(p) 23 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/test_imports.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class TestImports(unittest.TestCase): 5 | """ 6 | Basic smoke test to ensure that the installed packages can actually be 7 | imported (we had a compatibility issue once that was not resolved 8 | properly by conda). 9 | """ 10 | def test_infrastructure_packages(self): 11 | import gdown 12 | import sphinx 13 | import click 14 | import joblib 15 | import requests 16 | 17 | def test_common_packages(self): 18 | import numpy 19 | import scipy.sparse 20 | import pandas 21 | import bokeh 22 | import matplotlib 23 | import sklearn 24 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/exceptions.py: -------------------------------------------------------------------------------- 1 | class EasydataError(Exception): 2 | """General Easydata Error. Further error types are subclassed from this Exception""" 3 | pass 4 | 5 | class ValidationError(EasydataError): 6 | """Hash check failed""" 7 | pass 8 | 9 | class ObjectCollision(EasydataError): 10 | """Object already exists in object store 11 | 12 | This is more general than a FileExistsError, as it applies to more than just the filesystem. 13 | """ 14 | pass 15 | 16 | class NotFoundError(EasydataError): 17 | """Named object not found in object store 18 | 19 | This is more general than a FileNotFoundError, as it applies to more than just the filesystem. 
20 | """ 21 | pass 22 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/.post-create-environment.txt: -------------------------------------------------------------------------------- 1 | 2 | Now would be a good time to initialize a git repo; i.e. 3 | >>> git init 4 | >>> git add . 5 | >>> git commit -m 'initial import' 6 | >>> git branch easydata # tag for future easydata upgrades 7 | 8 | NOTE: By default, raw data is installed and unpacked in the 9 | `{{ cookiecutter.repo_name}}/data` directory. If you are working with big data (or 10 | have a small disk partition), it is HIGHLY recommended that you point 11 | this directory elsewhere; i.e. by setting paths['data_path']. For example: 12 | >>> conda activate {{ cookiecutter.repo_name }} 13 | >>> python -c "import {{ cookiecutter.module_name }}; {{ cookiecutter.module_name }}.paths['data_path'] = '/path/to/big/data'" 14 | 15 | You have been warned. 16 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/Makefile.include: -------------------------------------------------------------------------------- 1 | CONDA_EXE ?= {{ cookiecutter.conda_path }} 2 | PYTHON_INTERPRETER ?= python 3 | DEBUG_FILE := debug.txt 4 | MODULE_NAME := {{ cookiecutter.module_name }} 5 | TESTS_NO_CI = $(MODULE_NAME)/tests/no_ci 6 | PROJECT_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) 7 | PROJECT_NAME := {{ cookiecutter.repo_name }} 8 | VIRTUALENV := conda 9 | PLATFORM := $(shell $(PYTHON_INTERPRETER) -c "import platform; print(platform.platform())") 10 | LOCKFILE := environment.$(PLATFORM).lock.yml 11 | INSTALL_DEPS = conda make git 12 | 13 | ARCH = $(shell $(PYTHON_INTERPRETER) -c "import sys; print(sys.platform)") 14 | -include Makefile.$(ARCH) 15 | ECHO ?= echo 16 | RM ?= rm 17 | RMTREE ?= rm -rf 18 | CAT ?= cat 19 | SET ?= export 20 | WHICH ?= which 21 | DEVNULL ?= /dev/null 22 | CMDSEP ?= ; 23 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/scripts/am_i_ready.py: -------------------------------------------------------------------------------- 1 | from subprocess import run 2 | 3 | ## Tiny script to check if conda, make and git are installed 4 | 5 | all_installed = True 6 | for requirement in ['conda', 'make', 'git status']: 7 | try: 8 | process_run = run(requirement.split(), capture_output=True) 9 | if process_run. returncode == 0: 10 | print(f"{requirement} installed!") 11 | else: 12 | print(f"{requirement} does not seem to be installed correctly. See the README for help getting this tool ready.") 13 | all_installed = False 14 | except: 15 | print(f"{requirement} does not seem to be installed correctly. See the README for help getting this tool ready.") 16 | all_installed = False 17 | 18 | if all_installed: 19 | print("You're ready! 
Type 'make' for more options") 20 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/test_ci.py: -------------------------------------------------------------------------------- 1 | ## Test dataset information 2 | import logging 3 | import unittest 4 | 5 | from {{ cookiecutter.module_name }}.data import Dataset 6 | from {{ cookiecutter.module_name }} import workflow 7 | from {{ cookiecutter.module_name }}.log import logger 8 | 9 | 10 | class TestDatasetsSmall(unittest.TestCase): 11 | """ 12 | Basic smoke tests to ensure that the smaller (and more quickly processed) 13 | available datasets load and have some expected property. 14 | """ 15 | def test_20_newsgroups(self): 16 | ds = Dataset.load('20_newsgroups') 17 | ds = Dataset.load('20_newsgroups') 18 | assert len(ds.data) == 18846 19 | assert len(ds.target) == 18846 20 | 21 | def test_logging_is_debug_level(): 22 | assert logger.getEffectiveLevel() == logging.DEBUG 23 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/scripts/help-update.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | 4 | 5 | here = os.path.basename(os.getcwd()) 6 | print(f"""\ 7 | To update easydata on an existing repo, verify that you have an 'easydata' branch 8 | 9 | >>> git rev-parse -q --verify easydata 10 | 11 | If no output is given, do this: 12 | 13 | >>> git rev-list --max-parents=0 HEAD 14 | 15 | and copy-paste the output hash in this command: 16 | 17 | >>> git branch easydata #PASTE HASH HERE# 18 | 19 | Once you have the easydata branch, let's commit ("check in") all your changes, 20 | then merge the new easydata branch into yours: 21 | 22 | cd .. 23 | cookiecutter --config-file {here}/.easydata.yml easydata -f --no-input 24 | """) 25 | print("cd " + here) 26 | print("""\ 27 | git add -p # add all the changes 28 | git commit -m "sync with easydata" 29 | git checkout main 30 | git merge easydata 31 | """) 32 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/decorators.py: -------------------------------------------------------------------------------- 1 | 2 | # Singleton/SingletonDecorator.py 3 | class SingletonDecorator: 4 | """Turns a class into a Singleton class 5 | 6 | When placed before a class definition, ensures that all 7 | instances of this class return the same data; i.e. editing one 8 | will change them all.
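
    A minimal usage sketch (the `Config` class is hypothetical, for illustration only):

        @SingletonDecorator
        class Config:
            def __init__(self):
                self.n = 0

        a = Config()
        b = Config()
        assert a is b  # every "instantiation" hands back the same shared object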
9 | """ 10 | def __init__(self,klass): 11 | self.klass = klass 12 | self.instance = None 13 | def __call__(self,*args,**kwds): 14 | if self.instance == None: 15 | self.instance = self.klass(*args,**kwds) 16 | return self.instance 17 | 18 | # https://softwareengineering.stackexchange.com/questions/386755/sharing-docstrings-between-similar-functions 19 | def is_documented_by(original): 20 | def wrapper(target): 21 | target.__doc__ = original.__doc__ 22 | return target 23 | return wrapper 24 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/environment.yml: -------------------------------------------------------------------------------- 1 | {% macro pyver() -%} 2 | {% if cookiecutter.python_version == 'latest' -%} 3 | - python 4 | {% else -%} 5 | - python={{ cookiecutter.python_version }} 6 | {% endif -%} 7 | {% endmacro -%} 8 | name: {{ cookiecutter.repo_name }} 9 | channels: 10 | - defaults 11 | dependencies: 12 | - pip 13 | - pip: 14 | - -e . # conda >= 4.4 only 15 | - python-dotenv>=0.5.1 16 | - nbval 17 | - nbdime 18 | - gdown 19 | - setuptools 20 | - wheel 21 | - git>=2.5 # for git worktree template updating 22 | - sphinx 23 | - bokeh 24 | - click 25 | - colorcet 26 | - coverage 27 | - coveralls 28 | - matplotlib 29 | - jupyter 30 | - scikit-learn 31 | - scipy 32 | - joblib 33 | - nb_conda_kernels # Try <2.2.0 if you hit nb_conda_kernels issue #158 34 | - pandas 35 | - requests 36 | - pathlib 37 | - fsspec 38 | {{ pyver()|indent(2, true) }} 39 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Easydata 2 | site_description: A project template and directory structure for reproducible and collaborative Python-based data science projects. 
3 | site_favicon: favicon.ico 4 | repo_url: https://github.com/hackalog/cookiecutter-easydata 5 | copyright: CCBY 4.0 6 | google_analytics: ['UA-54096005-4', 'auto'] 7 | theme: readthedocs 8 | extra_css: 9 | - css/extra.css 10 | nav: 11 | - Home: index.md 12 | - Opinions: opinions.md 13 | - Datasets: datasets.md 14 | - 'Example Notebooks': 15 | - 'Basic Starter Notebook': 00-xyz-sample-notebook.ipynb 16 | - 'Creating a dataset from a csv file': Add-csv-template.ipynb 17 | - 'Creating a derived dataset from a single function': Add-derived-dataset.ipynb 18 | - 'Generic Raw Dataset Creation Template': New-Dataset-Template.ipynb 19 | - 'Generic Derived Dataset Creation Template': New-Edge-Template.ipynb 20 | plugins: 21 | - mkdocs-jupyter -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from ._paths import Paths 3 | 4 | _module_dir = pathlib.Path(__file__).parent.resolve() 5 | 6 | _path_defaults = { 7 | 'cache_path': '${data_path}/interim/cache', 8 | 'data_path': '${project_path}/data', 9 | 'figures_path': '${output_path}/figures', 10 | 'interim_data_path': '${data_path}/interim', 11 | 'notebook_path': '${project_path}/notebooks', 12 | 'output_path': '${project_path}/reports', 13 | 'processed_data_path': '${data_path}/processed', 14 | 'project_path': '${catalog_path}/..', 15 | 'raw_data_path': '${data_path}/raw', 16 | 'template_path': '${project_path}/reference/templates', 17 | 'abfs_cache': '${interim_data_path}/abfs_cache', 18 | } 19 | _catalog_file = _module_dir.parent / "catalog" / "config.ini" 20 | 21 | paths = Paths(_path_defaults, config_file=_catalog_file, config_section="Paths") 22 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib as pl 3 | import pytest 4 | import shutil 5 | import tempfile as tf 6 | 7 | from {{ cookiecutter.module_name }}.log import logger 8 | 9 | 10 | @pytest.fixture 11 | def manage_config_ini(doctest_namespace): 12 | path_config_ini = pl.Path("config.ini") 13 | if path_config_ini.exists(): 14 | # Save the current config.ini 15 | fd_temp, path_temp = tf.mkstemp() 16 | try: 17 | shutil.copyfile(path_config_ini, path_temp) 18 | path_config_ini.unlink() 19 | yield 20 | shutil.copyfile(path_temp, path_config_ini) 21 | finally: 22 | os.close(fd_temp) 23 | os.remove(path_temp) 24 | else: 25 | # Make sure we don't leave a spurious config.ini 26 | try: 27 | yield 28 | finally: 29 | if path_config_ini.exists(): 30 | path_config_ini.unlink() 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 DrivenData, Inc. 
3 | Copyright (c) 2018 Kjell Wooding 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/scripts/debug.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import Popen, PIPE, STDOUT 3 | import sys 4 | 5 | 6 | assert len(sys.argv) >= 2 7 | path_debug = sys.argv[1] 8 | 9 | with open(path_debug, "wb") as file_debug: 10 | def shell_out(*cmd): 11 | p = Popen(list(cmd), stdout=file_debug, stderr=STDOUT, stdin=PIPE) 12 | p.communicate(b"yes\n") 13 | return p.wait() 14 | 15 | def heading(h): 16 | file_debug.write(b"\n##\n## " + h.encode("utf-8") + b"\n##\n") 17 | file_debug.flush() 18 | 19 | heading("Git status") 20 | shell_out("git", "status") 21 | 22 | heading("git log") 23 | shell_out("git", "log", "-8", "--graph", "--oneline", "--decorate", "--all") 24 | 25 | heading("Git remotes") 26 | shell_out("git", "remote", "-v") 27 | 28 | heading("GitHub SSH credentials") 29 | shell_out("ssh", "git@github.com") 30 | 31 | heading("Conda config") 32 | shell_out(os.environ["CONDA_EXE"], "config", "--get") 33 | 34 | heading("Conda info") 35 | shell_out(os.environ["CONDA_EXE"], "info") 36 | 37 | heading("Conda list") 38 | shell_out(os.environ["CONDA_EXE"], "list") 39 | 40 | msg = "Please include the contents of " + path_debug + " when submitting an issue or support request." 
41 | print("=" * len(msg)) 42 | print(msg) 43 | print("=" * len(msg)) -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/test_catalog.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pathlib 3 | 4 | from {{ cookiecutter.module_name }}.data import Catalog 5 | from {{ cookiecutter.module_name }}.log import logger 6 | 7 | @pytest.fixture 8 | def catalog(tmpdir): 9 | """Create a test catalog""" 10 | 11 | # Setup 12 | # tmpdir should be empty when we get here 13 | c = Catalog.create(catalog_path=tmpdir) 14 | yield c 15 | 16 | # Teardown 17 | 18 | @pytest.fixture 19 | def old_catalog_file(): 20 | test_dir = pathlib.Path(__file__).parent.resolve() 21 | 22 | yield test_dir / 'data' / 'dataset-test.json' 23 | 24 | def test_old_catalog_init(tmpdir, old_catalog_file): 25 | c = Catalog.from_old_catalog(old_catalog_file, catalog_path=tmpdir) 26 | # Verify the catalog is nonempty and contains the expected data 27 | assert len(c) == 4 28 | for dsname in ["wine_reviews_130k", "wine_reviews_150k", 'wine_reviews_130k_varietals_75', 'wine_reviews']: 29 | assert dsname in c 30 | 31 | # Should fail, as it already exists 32 | with pytest.raises(FileExistsError): 33 | c = Catalog.from_old_catalog(old_catalog_file, catalog_path=tmpdir) 34 | 35 | # Should succeed, as replace is set 36 | c = Catalog.from_old_catalog(old_catalog_file, catalog_path=tmpdir, replace=True) 37 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/test_iter_directory.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from tempfile import mkdtemp 3 | from pathlib import Path 4 | import pytest 5 | import shutil 6 | 7 | from ..data.utils import iter_directory 8 | 9 | 10 | @contextmanager 11 | def dir_temp() -> Path: 12 | path = Path(mkdtemp()) 13 | try: 14 | yield path 15 | finally: 16 | shutil.rmtree(path) 17 | 18 | 19 | def test_iter_directory_empty(): 20 | with dir_temp() as d: 21 | assert list(iter_directory(d)) == [] 22 | 23 | 24 | def test_iter_directory_flat(): 25 | with dir_temp() as d: 26 | (d / "qwer").touch() 27 | (d / "asdf").touch() 28 | (d / "ghgh").touch() 29 | (d / "1234").touch() 30 | assert list(iter_directory(d)) == [d / i for i in ["1234", "asdf", "ghgh", "qwer"]] 31 | 32 | 33 | def test_iter_directory_deep(): 34 | with dir_temp() as d: 35 | (d / "a" / "b" / "a" / "A").mkdir(parents=True) 36 | (d / "a" / "hoho").touch() 37 | (d / "1").touch() 38 | (d / "a" / "b" / "a" / "A" / "v").touch() 39 | (d / "a" / "b" / "3").touch() 40 | (d / "a" / "b" / "z").touch() 41 | assert list(iter_directory(d)) == [ 42 | d / i 43 | for i in ["1", "a", "a/b", "a/b/3", "a/b/a", "a/b/a/A", "a/b/a/A/v", "a/b/z", "a/hoho"] 44 | ] 45 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py: -------------------------------------------------------------------------------- 1 | """A module where we temporarily smooth our way around API issues in Easydata. 2 | 3 | This is a place where we temporarily address UX and API issues in Easydata, usually by writing convenient wrappers around existing functionality. 
4 | 5 | Nothing in here is intended to be a stable API, so use at your own risk, as these contents are regularly deprecated. 6 | 7 | """ 8 | 9 | import sys 10 | import logging 11 | from .data import Catalog, Dataset, DataSource 12 | from .log import logger 13 | 14 | __all__ = [ 15 | 'make_target' 16 | ] 17 | 18 | def make_target(target): 19 | """process command from makefile 20 | 21 | Parameters 22 | ---------- 23 | target: target to execute 24 | """ 25 | 26 | if target == "datasets": 27 | c = Catalog.load('datasets') 28 | for dsname in c: 29 | logger.info(f"Generating Dataset:'{dsname}'") 30 | ds = Dataset.load(dsname) 31 | elif target == "datasources": 32 | c = Catalog.load('datasources') 33 | for name in c: 34 | logger.info(f"Fetching, unpacking, and processing DataSource:'{name}'") 35 | dsrc = DataSource.from_catalog(name) 36 | ds = dsrc.process() 37 | else: 38 | raise NotImplementedError(f"Target: '{target}' not implemented") 39 | 40 | 41 | if __name__ == '__main__': 42 | make_target(sys.argv[1]) 43 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data and local config from source control by default 79 | /data/ 80 | catalog/config.ini 81 | 82 | # Mac OS-specific storage files 83 | .DS_Store 84 | 85 | # Vim 86 | *.swp 87 | *.swo 88 | 89 | # Emacs 90 | *~ 91 | .*~ 92 | 93 | # Makefile Machinery 94 | .make.* 95 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Custom dataset processing/generation functions should be added to this file 3 | """ 4 | 5 | import pathlib 6 | from sklearn.datasets import fetch_20newsgroups 7 | 8 | from tqdm.auto import tqdm 9 | 10 | from .. import paths 11 | from ..log import logger 12 | 13 | __all__ = [ 14 | 'process_20_newsgroups' 15 | ] 16 | 17 | def process_20_newsgroups(*, extract_dir='20_newsgroups', 18 | metadata=None, unpack_dir=None, 19 | opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): 20 | """ 21 | Process 20 newsgroups into (data, target, metadata) format. 
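
    Normally this is wired up as a DataSource process_function (see
    tests/make_test_datasets.py); a direct call is only a sketch of the same behaviour:

        data, target, meta = process_20_newsgroups()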
22 | 23 | 24 | Parameters 25 | ---------- 26 | unpack_dir: path 27 | The interim parent directory the dataset files have been unpacked into. 28 | extract_dir: str 29 | Name of the directory of the unpacked files relative to the unpack_dir. Note that 30 | opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} 31 | Options to pass to sklearn.datasets.fetch_20newsgroups. 32 | 33 | 34 | Returns 35 | ------- 36 | A tuple: 37 | (data, target, additional_metadata) 38 | 39 | """ 40 | if metadata is None: 41 | metadata = {} 42 | 43 | if unpack_dir is None: 44 | unpack_dir = paths['interim_data_path'] 45 | else: 46 | unpack_dir = pathlib.Path(unpack_dir) 47 | data_dir = unpack_dir / f"{extract_dir}" 48 | 49 | news = fetch_20newsgroups(**opts) 50 | 51 | return news.data, news.target, metadata 52 | -------------------------------------------------------------------------------- /hooks/post_gen_project.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import shutil 3 | import logging 4 | import json 5 | try: 6 | from ruamel_yaml import YAML 7 | except ModuleNotFoundError: 8 | from ruamel.yaml import YAML 9 | 10 | from cookiecutter.config import get_user_config 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def copy_cookiecutter_resume(template_name='easydata'): 16 | """Make a copy of the cookiecutter replay file in the generated project. 17 | 18 | By default, cookiecutter creates a replay directory in a user's 19 | ~/.cookiecutter directory. This is largely useless. Easydata dumps 20 | this data to the generated project (also as json) using a jsonify 21 | call, but this doesn't yet help us regenerate the project 22 | automatically. This hook creates a YAML version of those values 23 | in the generated project. This can be used to regenerate the 24 | project by doing a: 25 | 26 | >>> cookiecutter --config_file path/to/.easydata.yaml easydata 27 | 28 | """ 29 | # relative to root of generated project 30 | src_path = f'.{template_name}.json' 31 | yml_path = f'.{template_name}.yml' 32 | 33 | logger.debug(f"Reading cookiecutter replay data from {src_path}") 34 | with open(src_path) as f: 35 | cookiecutter_opts = json.load(f) 36 | yaml_opts = {k:v 37 | for k,v in sorted(cookiecutter_opts.items()) 38 | if not k.startswith('_')} 39 | yaml = YAML() 40 | yaml.default_flow_style=False 41 | yaml.width=4096 42 | yaml.indent(offset=4) 43 | logger.debug(f"Dumping cookiecutter replay (YAML) info to {yml_path}") 44 | with open(yml_path, 'w') as fw: 45 | yaml.dump({'default_context': yaml_opts}, fw) 46 | 47 | if __name__ == '__main__': 48 | copy_cookiecutter_resume() 49 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | # use `-browsers` prefix for selenium tests, e.g. 
`3.6.1-browsers` 11 | - image: continuumio/miniconda3 12 | 13 | 14 | # Specify service dependencies here if necessary 15 | # CircleCI maintains a library of pre-built images 16 | # documented at https://circleci.com/docs/2.0/circleci-images/ 17 | # - image: circleci/postgres:9.4 18 | 19 | working_directory: ~/repo 20 | 21 | steps: 22 | - checkout 23 | 24 | - run: 25 | name: Create environment and contrive to always use it 26 | command: | 27 | conda update --yes --quiet conda; 28 | export CONDA_EXE=/home/circleci/miniconda/bin/conda 29 | make create_environment 30 | conda init bash 31 | sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV 32 | echo "conda activate {{ cookiecutter.repo_name }}" >> $BASH_ENV; 33 | 34 | - run: 35 | name: Create test report directory 36 | command: | 37 | mkdir test-reports 38 | 39 | # Cache dependencies 40 | - save_cache: 41 | key: pip-cache 42 | paths: 43 | - ~/.cache/pip 44 | 45 | - restore_cache: 46 | keys: 47 | - pip-cache 48 | 49 | - run: 50 | name: Run tests 51 | command: | 52 | make test CI_RUNNING=yes 53 | 54 | - store_test_results: 55 | path: test-reports 56 | 57 | - store_artifacts: 58 | path: test-reports 59 | destination: test-reports 60 | -------------------------------------------------------------------------------- /docs/test_docs.py: -------------------------------------------------------------------------------- 1 | ## To be run from inside a cookiecutter-easdata project conda environment 2 | ## that has the catalog files and structure of the test-env from the cookiecutter-easydata 3 | ## CI setup. 4 | 5 | import subprocess 6 | import tempfile 7 | import unittest 8 | from pathlib import Path 9 | import requests 10 | 11 | from src import paths 12 | from src.log import logger 13 | 14 | 15 | CCDS_ROOT = Path(__file__).parents[1].resolve() 16 | DOCS_DIR = CCDS_ROOT / "docs" 17 | 18 | def _exec_notebook(path): 19 | """ 20 | Helper function to execute a notebook. 21 | """ 22 | with tempfile.NamedTemporaryFile(suffix=".ipynb") as fout: 23 | args = ["jupyter", "nbconvert", "--to", "notebook", "--execute", 24 | "--ExecutePreprocessor.timeout=1000", 25 | "--ExecutePreprocessor.kernel_name=python", 26 | "--output", fout.name, path] 27 | subprocess.check_call(args) 28 | 29 | class TestDocNotebooks(unittest.TestCase): 30 | """ 31 | Test that the documentation notebooks run. 
32 | """ 33 | def test_notebook_00(self): 34 | _exec_notebook(DOCS_DIR / "00-xyz-sample-notebook.ipynb") 35 | 36 | def test_notebook_csv(self): 37 | csv_url = "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv" 38 | csv_dest = paths['raw_data_path'] / "epidemiology.csv" 39 | if not csv_dest.exists(): 40 | logger.debug("Downloading epidemiology.csv") 41 | csv_file = requests.get(csv_url) 42 | with open(csv_dest, 'wb') as f: 43 | f.write(csv_file.content) 44 | _exec_notebook(DOCS_DIR / "Add-csv-template.ipynb") 45 | 46 | def test_notebook_derived(self): 47 | _exec_notebook(DOCS_DIR / "Add-derived-dataset.ipynb") 48 | 49 | def test_notebook_generic_dataset(self): 50 | _exec_notebook(DOCS_DIR / "New-Dataset-Template.ipynb") 51 | 52 | def test_notebook_generic_edge(self): 53 | _exec_notebook(DOCS_DIR / "New-Edge-Template.ipynb") 54 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/scripts/help.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import re 3 | import sys 4 | 5 | 6 | def bold(s): 7 | return s 8 | def cyan(s): 9 | return s 10 | 11 | ap = ArgumentParser() 12 | ap.add_argument("-v", nargs=2, action="append", dest="variables") 13 | ap.add_argument("makefiles", nargs="+") 14 | ns = ap.parse_args(sys.argv[1:]) 15 | ls_variables = list(ns.variables) 16 | d_variables = dict(ls_variables) 17 | makefiles = list(ns.makefiles) 18 | if "PROJECT_NAME" not in d_variables: 19 | print("No project name! Consult with EasyData maintainers.") 20 | sys.exit(1) 21 | project_name = d_variables["PROJECT_NAME"] 22 | 23 | print("*** To get started ***") 24 | print("") 25 | print(" >>> " + bold("make create_environment")) 26 | print(" >>> " + bold("conda activate " + project_name)) 27 | print(" >>> " + bold("make update_environment")) 28 | 29 | print("") 30 | print(bold("*** Project variables ***")) 31 | print("") 32 | 33 | len_name_max = max([len(name) for name, value in ls_variables]) 34 | for name, value in ls_variables: 35 | print(name + " " * (len_name_max - len(name)) + " = " + value) 36 | 37 | print("") 38 | print(bold("*** Available rules ***")) 39 | print("") 40 | rules = [] 41 | for path in set(makefiles): 42 | f = open(path, "rb") 43 | makefile = iter(f.read().split(b"\n")) 44 | f.close() 45 | while True: 46 | try: 47 | line = next(makefile) 48 | lines_doc = [] 49 | while line.startswith(b"## "): 50 | lines_doc.append(line[2:]) 51 | line = next(makefile) 52 | if len(lines_doc) > 0: 53 | # We have collected some documentation. Current line now contains the target name. 
54 | target = line.split(b":")[0] 55 | lines_doc = [ 56 | re.sub( 57 | r"\$\(([-a-zA-Z0-9_]+)\)", 58 | lambda m: d_variables.get(m.group(1), "???"), 59 | ll.decode("utf-8") 60 | ).strip() 61 | for ll in lines_doc 62 | ] 63 | rules.append((target.decode("utf-8"), " ".join(lines_doc))) 64 | except StopIteration: 65 | break 66 | 67 | width_target = max([len(target) for target, _ in rules]) 68 | for target, doc in sorted(rules, key=lambda p: p[0]): 69 | print(cyan(target) + " " * (width_target - len(target)) + " " + doc) 70 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Easydata 2 | _A python framework and git template for data scientists, teams, and workshop organizers 3 | aimed at making your data science work **reproducible and shareable**_ 4 | 5 | For most of us, data science is 5% science, 60% data cleaning, and 35% 6 | IT hell. Easydata focuses on the 95% by helping you deliver 7 | 8 | * reproducible python environments, 9 | * reproducible datasets, and 10 | * reproducible workflows 11 | 12 | In other words, Easydata is a template, library, and workflow that lets you **get up and running with your data science analysis, quickly and reproducibly**. 13 | 14 | ## What is Easydata? 15 | 16 | Easydata is a python cookiecutter for building custom data science git repos that provides: 17 | 18 | * An [**opinionated workflow**](opinions.md) for collaboration and storytelling, 19 | * A **python framework** to support this workflow, 20 | * A **makefile wrapper** for conda and pip environment management, 21 | * A catalog of prebuilt **dataset recipes**, and 22 | * A library of training materials and documentation around doing reproducible data science. 23 | 24 | Easydata is **not** 25 | 26 | * an ETL toolkit, 27 | * a data analysis pipeline, 28 | * a containerization solution, or 29 | * a prescribed data format. 30 | 31 | ## Contributing 32 | 33 | The Easydata project is opinionated, but not afraid to be wrong. Best practices change, tools evolve, and lessons are learned. **The goal of this project is to make it easier to start, structure, and share your data science work.** [Pull requests](https://github.com/hackalog/cookiecutter-easydata/pulls) and [filing issues](https://github.com/hackalog/cookiecutter-easydata/issues) are encouraged. We'd love to hear what works for you, and what doesn't. 34 | 35 | If you use the Cookiecutter Easydata Project, link back to this page. 36 | 37 | ## Links to related projects and references 38 | 39 | Easydata started life as an [opinionated fork](opinions.md) of the [cookiecutter-datascience] project. Easydata has evolved considerably since then with a specific focus on enabling overall team efficiency by improving collaboration and reproducibility. 40 | We owe the cookiecutter-datascience project a great debt for the work they have done in creating 41 | a flexible but highly useful project template. 42 | 43 | 44 | [cookiecutter-datascience]: https://github.com/drivendata/cookiecutter-data-science/ 45 | 46 | Also, a huge thanks to the 47 | [Cookiecutter](https://cookiecutter.readthedocs.org/en/latest/) 48 | project ([github](https://github.com/audreyr/cookiecutter)), which is 49 | helping us all spend less time thinking about and writing boilerplate 50 | and more time getting things done.
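
To see what a generated project looks like, you can bake one straight from this
template (this assumes the `cookiecutter` package is already installed, e.g. via
`pip install cookiecutter`):

    cookiecutter https://github.com/hackalog/cookiecutter-easydata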
51 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import fetch_20newsgroups 2 | from functools import partial 3 | 4 | from {{ cookiecutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog 5 | from {{ cookiecutter.module_name }}.data.process_functions import process_20_newsgroups 6 | from {{ cookiecutter.module_name }} import paths 7 | from {{ cookiecutter.module_name }}.log import logger 8 | 9 | # Set up a 20 newsgroups dataset 10 | 11 | license = """ 12 | Custom Academic License: "You may use this material free of charge for any educational purpose, provided attribution is given in any lectures or publications that make use of this material." As in http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.data.html. 13 | """ 14 | metadata = """ 15 | The 20 Newsgroups dataset is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. 16 | 17 | The data is organized into 20 different newsgroups, each corresponding to a different topic. Some of the newsgroups are very closely related to each other (e.g. comp.sys.ibm.pc.hardware / comp.sys.mac.hardware), while others are highly unrelated (e.g misc.forsale / soc.religion.christian). 18 | 19 | Here are the categories: 20 | 21 | * `alt.atheism`, 22 | * `comp.graphics`, 23 | * `comp.os.ms-windows.misc`, 24 | * `comp.sys.ibm.pc.hardware`, 25 | * `comp.sys.mac.hardware`, 26 | * `comp.windows.x`, 27 | * `misc.forsale`, 28 | * `rec.autos`, 29 | * `rec.motorcycles`, 30 | * `rec.sport.baseball`, 31 | * `rec.sport.hockey`, 32 | * `sci.crypt`, 33 | * `sci.electronics`, 34 | * `sci.med`, 35 | * `sci.space`, 36 | * `soc.religion.christian`, 37 | * `talk.politics.guns`, 38 | * `talk.politics.mideast`, 39 | * `talk.politics.misc`, 40 | * `talk.religion.misc` 41 | 42 | The current version is obtained by wrapping `sklearn.datasets.fetch_20newsgroups`, which comes from this [20 newsgroups webpage](http://qwone.com/~jason/20Newsgroups/). 43 | 44 | By default we follow the sklearn suggestion to set `remove=('headers', 'footers', 'quotes')` to avoid overfitting. 
45 | """ 46 | if __name__ =='__main__': 47 | ds_name = '20_newsgroups' 48 | output_ds_name = ds_name 49 | dsrc = DataSource(ds_name) 50 | 51 | dsrc.add_metadata(contents=metadata, force=True) 52 | dsrc.add_metadata(contents=license, kind='LICENSE', force=True) 53 | 54 | process_function = process_20_newsgroups 55 | process_kwargs = {} 56 | 57 | dsrc.process_function = partial(process_function, **process_kwargs) 58 | dsrc.update_catalog() 59 | 60 | dag = DatasetGraph() 61 | dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) 62 | -------------------------------------------------------------------------------- /tests/test_creation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import shutil 4 | from pathlib import Path 5 | 6 | from cookiecutter import main 7 | 8 | CCDS_ROOT = Path(__file__).parents[1].resolve() 9 | 10 | 11 | @pytest.fixture(scope='function') 12 | def default_baked_project(tmpdir): 13 | temp = tmpdir.mkdir('data-project') 14 | out_dir = Path(temp).resolve() 15 | 16 | main.cookiecutter( 17 | str(CCDS_ROOT), 18 | no_input=True, 19 | extra_context={}, 20 | output_dir=out_dir 21 | ) 22 | 23 | # default project name is project_name 24 | yield out_dir / 'project_name' 25 | 26 | # cleanup after 27 | shutil.rmtree(out_dir) 28 | 29 | 30 | def test_readme(default_baked_project): 31 | readme_path = default_baked_project / 'README.md' 32 | 33 | assert readme_path.exists() 34 | assert no_curlies(readme_path) 35 | 36 | 37 | def test_license(default_baked_project): 38 | license_path = default_baked_project / 'LICENSE' 39 | 40 | assert license_path.exists() 41 | assert no_curlies(license_path) 42 | 43 | 44 | def test_requirements(default_baked_project): 45 | reqs_path = default_baked_project / 'environment.yml' 46 | 47 | assert reqs_path.exists() 48 | assert no_curlies(reqs_path) 49 | 50 | 51 | def test_makefile(default_baked_project): 52 | makefile_path = default_baked_project / 'Makefile' 53 | 54 | assert makefile_path.exists() 55 | assert no_curlies(makefile_path) 56 | 57 | 58 | def test_folders(default_baked_project): 59 | expected_dirs = [ 60 | 'catalog', 61 | 'data', 62 | 'data/external', 63 | 'data/interim', 64 | 'data/processed', 65 | 'data/raw', 66 | 'docs', 67 | 'models', 68 | 'notebooks', 69 | 'references', 70 | 'reports', 71 | 'reports/figures', 72 | 'src', 73 | 'src/data', 74 | ] 75 | 76 | ignored_dirs = [ 77 | str(default_baked_project) 78 | ] 79 | 80 | abs_expected_dirs = [str(default_baked_project / d) for d in expected_dirs] 81 | abs_dirs, _, _ = list(zip(*os.walk(default_baked_project))) 82 | assert len(set(abs_expected_dirs + ignored_dirs) - set(abs_dirs)) == 0 83 | 84 | 85 | def no_curlies(filepath): 86 | """ Utility to make sure no curly braces appear in a file. 87 | That is, was jinja able to render everthing? 
88 | """ 89 | with open(filepath, 'r') as f: 90 | data = f.read() 91 | 92 | template_strings = [ 93 | '{{', 94 | '}}', 95 | '{%', 96 | '%}' 97 | ] 98 | 99 | template_strings_in_file = [s in data for s in template_strings] 100 | return not any(template_strings_in_file) 101 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/LICENSE: -------------------------------------------------------------------------------- 1 | {% if cookiecutter.open_source_license == 'MIT' %} 2 | The MIT License (MIT) 3 | Copyright (c) {% now 'utc', '%Y' %}, {{ cookiecutter.author_name }} 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | {% elif cookiecutter.open_source_license == 'BSD-3-Clause' %} 11 | Copyright (c) {% now 'utc', '%Y' %}, {{ cookiecutter.author_name }} 12 | All rights reserved. 13 | 14 | Redistribution and use in source and binary forms, with or without modification, 15 | are permitted provided that the following conditions are met: 16 | 17 | * Redistributions of source code must retain the above copyright notice, this 18 | list of conditions and the following disclaimer. 19 | 20 | * Redistributions in binary form must reproduce the above copyright notice, this 21 | list of conditions and the following disclaimer in the documentation and/or 22 | other materials provided with the distribution. 23 | 24 | * Neither the name of {{ cookiecutter.project_name }} nor the names of its 25 | contributors may be used to endorse or promote products derived from this 26 | software without specific prior written permission. 27 | 28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 29 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 30 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 35 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 36 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 37 | OF THE POSSIBILITY OF SUCH DAMAGE. 
38 | {% elif cookiecutter.open_source_license == 'Proprietary' %} 39 | Copyright (c) {% now 'utc', '%Y' %}, {{ cookiecutter.author_name }} 40 | All rights reserved. 41 | {% endif %} 42 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/scripts/split_pip.py: -------------------------------------------------------------------------------- 1 | #!env python 2 | import json 3 | import sys 4 | import yaml 5 | from collections import defaultdict 6 | 7 | 8 | def env_split(conda_env, channel_order): 9 | """Given a conda_environment dict, and a channel order, split into versions for each channel. 10 | 11 | Returns: 12 | 13 | conda_env: (list) 14 | remaining setup bits of the environment.yml file 15 | channel_dict: (dict) 16 | dict containing the list of dependencies by channel name 17 | 18 | Python object corresponding to environment.yml""" 19 | # Cheater way to make deep Copies 20 | json_copy = json.dumps(conda_env) 21 | conda_env = json.loads(json_copy) 22 | pip_env = json.loads(json_copy) 23 | 24 | pipdeps = None 25 | deplist = conda_env.pop('dependencies') 26 | channel_dict = defaultdict(list) 27 | 28 | for k, dep in enumerate(deplist[:]): # Note: copy list, as we mutate it 29 | if isinstance(dep, dict): # nested yaml 30 | if dep.get('pip', None): 31 | channel_dict['pip'] = deplist.pop(k) 32 | else: 33 | prefix_check = dep.split('::') 34 | if len(prefix_check) > 1: 35 | channel = prefix_check[0] 36 | if not channel in channel_order: 37 | raise Exception(f'the channel {channel} required for {dep} is not specified in a channel-order section of the environment file') 38 | channel_dict[f'{channel}'].append(prefix_check[1]) 39 | deplist.remove(dep) 40 | 41 | channel_dict['defaults'] = deplist 42 | conda_env.pop('channel-order', None) 43 | return conda_env, channel_dict 44 | 45 | def get_channel_order(conda_env): 46 | """ 47 | Given a conda_environment dict, get the channels from the channel order. 48 | """ 49 | channel_order = conda_env.get('channel-order') 50 | 51 | if channel_order is None: 52 | channel_order = ['defaults'] 53 | if not 'defaults' in channel_order: 54 | channel_order.insert(0, 'defaults') 55 | channel_order.append('pip') 56 | return channel_order 57 | 58 | def usage(): 59 | print(f""" 60 | Usage: split_pip.py path/to/environment.yml 61 | """) 62 | if __name__ == '__main__': 63 | if len(sys.argv) != 2: 64 | usage() 65 | exit(1) 66 | 67 | with open(sys.argv[1], 'r') as yamlfile: 68 | conda_env = yaml.safe_load(yamlfile) 69 | 70 | #check for acceptable formats 71 | channel_order = get_channel_order(conda_env) 72 | with open('.make.channel-order.include', 'w') as f: 73 | f. write(' '.join(channel_order[:-1])) #exclude pip as a channel here 74 | 75 | cenv, channel_dict = env_split(conda_env, channel_order) 76 | 77 | for kind in channel_order: 78 | if kind == "pip": 79 | filename = '.make.pip-requirements.txt' 80 | with open(filename, 'w') as f: 81 | f.write("\n".join(channel_dict['pip']['pip'])) 82 | else: 83 | filename = f'.make.{kind}-environment.txt' 84 | with open(filename, 'w') as f: 85 | f.write("\n".join(channel_dict[kind])) 86 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md: -------------------------------------------------------------------------------- 1 | # The EasyData Git Workflow 2 | Here's our suggestion for a reliable git workflow that works well in **small team settings**; e.g. 
when using [Easydata][easydata] in a group setting. 3 | 4 | ## Git configuration 5 | 6 | If you haven't yet done so, please follow the instructions 7 | in our [Git Configuration Guide](git-configuration.md) first. 8 | 9 | ## Git Workflow 10 | 11 | We suggest you start each day by doing this: 12 | 13 | ### Where was I? What was I doing? Did I check it in? 14 | Sometimes, you stop work without checking things back in to the repo. 15 | Now, before you do any additional work, is the time to fix that. 16 | ```bash 17 | git branch # what branch am I on? 18 | git status # are there any files that need checking in? 19 | git add -p # accept or reject parts of the modified files 20 | git commit -m "put your commit message here" 21 | ``` 22 | 23 | ### Did I do any work elsewhere? 24 | Did you make changes to your personal fork, but on a different machine? Make sure your local branch is up-to-date with your personal fork (`origin`): 25 | ```bash 26 | git checkout {{cookiecutter.default_branch}} 27 | git fetch origin --prune 28 | git merge origin/{{cookiecutter.default_branch}} 29 | ``` 30 | 31 | ### What happened upstream? 32 | Did someone make changes to the `upstream` repo in your absence? 33 | Let's fetch and merge those changes: 34 | 35 | ```bash 36 | git checkout {{cookiecutter.default_branch}} 37 | git fetch upstream --prune 38 | git merge upstream/{{cookiecutter.default_branch}} 39 | git push origin {{cookiecutter.default_branch}} 40 | make update_environment 41 | ``` 42 | 43 | ### Am I working from the latest `{{cookiecutter.default_branch}}`? 44 | Now that your `{{cookiecutter.default_branch}}` branch is up-to-date with both `origin` and `upstream`, you should use it to update your local working branches. If you are already developing in a branch called, e.g. `my_branch`, do this before writing any more code: 45 | 46 | ```bash 47 | git checkout my_branch 48 | git merge {{cookiecutter.default_branch}} 49 | git push origin my_branch 50 | ``` 51 | 52 | ### Clean up the junk 53 | With your local `{{cookiecutter.default_branch}}`, `origin/{{cookiecutter.default_branch}}` and `upstream/{{cookiecutter.default_branch}}` all in sync, we like to clean up any old branches that are fully merged (and hence can be deleted without data loss): 54 | ```bash 55 | git branch --merged {{cookiecutter.default_branch}} # list branches that are fully merged 56 | git branch -d <branch_name> # delete a fully merged branch by name 57 | ``` 58 | A really great feature of `git branch -d` is that it will refuse to remove a branch that hasn't been fully merged into another. Thus it's safe to use without any fear of data loss. 59 | 60 | 61 | ### Start the day 62 | Once you've finished all your merge tasks, you can create a clean working branch from the latest `{{cookiecutter.default_branch}}` by doing the following: 63 | ```bash 64 | git checkout {{cookiecutter.default_branch}} 65 | git checkout -b new_branch_name 66 | ``` 67 | 68 | That's it! Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue in our 69 | [easydata issue tracker].
70 | 71 | [easydata issue tracker]: https://github.com/hackalog/easydata/issues 72 | [easydata]: https://github.com/hackalog/easydata -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` 11 | - image: continuumio/miniconda3 12 | 13 | # Specify service dependencies here if necessary 14 | # CircleCI maintains a library of pre-built images 15 | # documented at https://circleci.com/docs/2.0/circleci-images/ 16 | # - image: circleci/postgres:9.4 17 | 18 | working_directory: ~/repo 19 | 20 | steps: 21 | - checkout 22 | 23 | - run: 24 | name: Set up Conda 25 | command: | 26 | conda init bash 27 | conda update --yes --quiet conda; 28 | export CONDA_EXE=/opt/conda/bin/conda 29 | sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV 30 | 31 | - run: 32 | name: Build cookiecutter environment and test-env project 33 | command: | 34 | conda create -n cookiecutter --yes python=3.8 make 35 | conda activate cookiecutter 36 | pip install cookiecutter 37 | pip install ruamel.yaml 38 | mkdir -p /root/repo/.cookiecutter_replay 39 | cp circleci-cookiecutter-easydata.json /root/repo/.cookiecutter_replay/cookiecutter-easydata.json 40 | pwd 41 | which make 42 | cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . -f --no-input 43 | 44 | - run: 45 | name: Create test-env environment and contrive to always use it 46 | command: | 47 | conda activate cookiecutter 48 | cd test-env 49 | export CONDA_EXE=/opt/conda/bin/conda 50 | make create_environment 51 | python scripts/tests/add-extra-channel-dependency.py 52 | conda activate test-env 53 | conda install -c anaconda make 54 | touch environment.yml 55 | make update_environment 56 | echo "conda activate test-env" >> $BASH_ENV; 57 | 58 | - run: 59 | name: Create test report directory 60 | command: | 61 | mkdir test-reports 62 | 63 | # Cache dependencies 64 | - save_cache: 65 | key: pip-cache 66 | paths: 67 | - ~/.cache/pip 68 | 69 | - restore_cache: 70 | keys: 71 | - pip-cache 72 | 73 | - run: 74 | name: Run tests 75 | command: | 76 | cd test-env 77 | python src/tests/make_test_datasets.py 78 | make test CI_RUNNING=yes 79 | 80 | - run: 81 | name: Run documentation notebook tests 82 | command: | 83 | cd docs 84 | pytest -v test_docs.py 85 | 86 | - store_test_results: 87 | path: test-reports 88 | 89 | - store_artifacts: 90 | path: test-reports 91 | destination: test-reports 92 | # The resource_class feature allows configuring CPU and RAM resources for each job. Different resource classes are available for different executors. https://circleci.com/docs/2.0/configuration-reference/#resourceclass 93 | resource_class: large 94 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for handling "fileset" data; i.e. 
collections of raw files associated with a Dataset 3 | """ 4 | 5 | from collections import defaultdict 6 | import pathlib 7 | import shutil 8 | import os 9 | 10 | from tqdm.auto import tqdm 11 | 12 | from .. import paths 13 | from ..log import logger 14 | 15 | __all__ = [ 16 | 'process_fileset_files', 17 | ] 18 | 19 | def process_fileset_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", fileset_dir=".fileset", dataset_dir=None, do_copy=False): 20 | """ 21 | Process unpacked raw files into its minimal dataset components (data, target, metadata). 22 | Here, 'minimal' means `data` and `target` will be None, and `fileset` will contain a 23 | file dict of files matching the specified file_glob (and their sizes). 24 | 25 | Parameters 26 | ---------- 27 | unpack_dir: default paths['interim_data_path'] 28 | The directory the interim data files have been unpacked into 29 | dataset_dir: default paths['processed_data_path'] 30 | location of processed datasets. 31 | extract_dir: 32 | Name of the directory of the unpacked zip file containing the raw data files. 33 | relative to unpack_dir 34 | file_glob: string 35 | Add only files matching this glob pattern to FILESET 36 | fileset_dir: string 37 | Used in building the file_dict keys. 38 | do_copy: boolean 39 | if True, actually copy the files. Otherwise just build FILESET 40 | 41 | Returns 42 | ------- 43 | (data, target, additional_metadata) 44 | 45 | where 46 | 47 | data and target are None, 48 | 49 | metadata contains a file dict; i.e. 50 | 'fileset': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} 51 | """ 52 | if metadata is None: 53 | metadata = {} 54 | 55 | if dataset_dir is None: 56 | dataset_dir = paths['processed_data_path'] 57 | else: 58 | dataset_dir = pathlib.Path(dataset_dir) 59 | if unpack_dir is None: 60 | unpack_dir = paths['interim_data_path'] 61 | else: 62 | unpack_dir = pathlib.Path(unpack_dir) 63 | if extract_dir is not None: 64 | unpack_dir /= extract_dir 65 | 66 | fileset_dir = pathlib.Path(fileset_dir) 67 | fileset_dir_fq = dataset_dir / fileset_dir 68 | logger.debug(f"Do copy: {do_copy}") 69 | if do_copy: 70 | if fileset_dir_fq.is_dir(): 71 | logger.warning(f"Cleaning contents of {fileset_dir}") 72 | shutil.rmtree(fileset_dir_fq) 73 | logger.debug(f"Copying files to {fileset_dir_fq}...") 74 | 75 | file_dict = defaultdict(dict) 76 | files = sorted(list(unpack_dir.rglob(file_glob))) 77 | for i, file in enumerate(tqdm(files)): 78 | if file.is_dir(): 79 | continue 80 | relative_path = file.relative_to(unpack_dir) 81 | fileset_path = fileset_dir / relative_path 82 | file_dict[str(fileset_path.parent)][str(fileset_path.name)] = [f'size:{os.path.getsize(file)}'] 83 | if do_copy: 84 | os.makedirs(dataset_dir / fileset_path.parent, exist_ok=True) 85 | shutil.copyfile(file, dataset_dir / fileset_path) 86 | metadata['fileset'] = dict(file_dict) 87 | 88 | return None, None, metadata 89 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/Makefile: -------------------------------------------------------------------------------- 1 | # Project Variables 2 | include Makefile.include 3 | # Reproducible Environments 4 | include Makefile.envs 5 | 6 | # 7 | # Deprecated 8 | # 9 | 10 | .PHONY: requirements 11 | 12 | requirements: update_environment 13 | @$(ECHO) "WARNING: 'make requirements' is deprecated. 
Use 'make update_environment'" 14 | 15 | .PHONY: unfinished 16 | unfinished: 17 | @$(ECHO) "WARNING: this target is unfinished and may be removed or changed dramatically in future releases" 18 | 19 | # 20 | # COMMANDS # 21 | # 22 | 23 | .PHONY: check_requirements 24 | ## Ensure all installation requirements are installed 25 | check_requirements: check_installation 26 | @$(PYTHON_INTERPRETER) scripts/am_i_ready.py 27 | 28 | .PHONY: data 29 | data: datasets 30 | 31 | .PHONY: raw 32 | raw: datasources 33 | 34 | .PHONY: datasources 35 | datasources: .make.datasources 36 | 37 | .make.datasources: catalog/datasources/* 38 | $(PYTHON_INTERPRETER) -m $(MODULE_NAME).workflow datasources 39 | #touch .make.datasources 40 | 41 | .PHONY: datasets 42 | datasets: .make.datasets 43 | 44 | .make.datasets: catalog/datasets/* catalog/transformers/* 45 | $(PYTHON_INTERPRETER) -m $(MODULE_NAME).workflow datasets 46 | #touch .make.datasets 47 | 48 | .PHONY: clean 49 | ## Delete all compiled Python files 50 | clean: 51 | $(PYTHON_INTERPRETER) scripts/clean.py 52 | 53 | .PHONY: clean_interim 54 | clean_interim: 55 | $(RM) data/interim/* 56 | 57 | .PHONY: clean_raw 58 | clean_raw: 59 | $(RM) data/raw/* 60 | 61 | .PHONY: clean_processed 62 | clean_processed: 63 | $(RM) data/processed/* 64 | 65 | .PHONY: clean_workflow 66 | clean_workflow: 67 | $(RM) catalog/datasources.json 68 | $(RM) catalog/transformer_list.json 69 | 70 | .PHONY: test 71 | 72 | ## Run all Unit Tests 73 | test: update_environment 74 | $(SET) LOGLEVEL=DEBUG; pytest --pyargs --doctest-modules --doctest-continue-on-failure --verbose \ 75 | $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ 76 | $(MODULE_NAME) 77 | 78 | ## Run all Unit and code coverage tests 79 | test_with_coverage: update_environment 80 | $(SET) LOGLEVEL=DEBUG; coverage run -m pytest --pyargs --doctest-modules --doctest-continue-on-failure --verbose \ 81 | $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ 82 | $(MODULE_NAME) 83 | 84 | .phony: help_update_easydata 85 | help_update_easydata: 86 | @$(PYTHON_INTERPRETER) scripts/help-update.py 87 | 88 | .PHONY: debug 89 | ## dump useful debugging information to $(DEBUG_FILE) 90 | debug: 91 | @$(PYTHON_INTERPRETER) scripts/debug.py $(DEBUG_FILE) 92 | 93 | 94 | ################################################################################# 95 | # PROJECT RULES # 96 | ################################################################################# 97 | 98 | 99 | ################################################################################# 100 | # Self Documenting Commands # 101 | ################################################################################# 102 | 103 | HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM SHELL 104 | 105 | .DEFAULT_GOAL := show-help 106 | .PHONY: show-help 107 | show-help: 108 | @$(PYTHON_INTERPRETER) scripts/help.py $(foreach v,$(HELP_VARS),-v $(v) $($(v))) $(MAKEFILE_LIST) 109 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/Makefile.envs: -------------------------------------------------------------------------------- 1 | # 2 | # Environment Management Makefile 3 | # 4 | 5 | include Makefile.include 6 | 7 | $(LOCKFILE): check_installation .make.bootstrap split_environment_files 8 | ifeq (conda, $(VIRTUALENV)) 9 | $(foreach channel, $(shell $(CAT) .make.channel-order.include),\ 10 | $(CONDA_EXE) install --name $(PROJECT_NAME) --file .make.$(channel)-environment.txt --channel defaults --channel $(channel) --strict-channel-priority --yes 
$(CMDSEP)) 11 | $(CONDA_EXE) run --name $(PROJECT_NAME) --no-capture pip install --requirement .make.pip-requirements.txt 12 | $(CONDA_EXE) env export --name $(PROJECT_NAME) --file $(LOCKFILE) 13 | else 14 | $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) 15 | endif 16 | 17 | .PHONY: split_environment_files 18 | # extract multi-phase dependencies from environment.yml and create ordering file 19 | split_environment_files: environment.yml .make.bootstrap 20 | $(CONDA_EXE) run --name $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py $(PROJECT_DIR)environment.yml 21 | 22 | .make.bootstrap: scripts/bootstrap.yml 23 | $(CONDA_EXE) env update --name $(PROJECT_NAME) --file scripts/bootstrap.yml 24 | $(ECHO) "" > $@ 25 | 26 | .PHONY: create_environment 27 | ## Set up virtual (conda) environment for this project 28 | create_environment: $(LOCKFILE) 29 | ifeq (conda,$(VIRTUALENV)) 30 | @$(RM) $(LOCKFILE) 31 | @$(PYTHON_INTERPRETER) -c "print('\nNew conda env created. Activate with:\n>>> conda activate $(PROJECT_NAME)\n>>> make update_environment')" 32 | ifneq ("X$(wildcard .post-create-environment.txt)","X") 33 | @$(CAT) .post-create-environment.txt 34 | endif 35 | else 36 | $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) 37 | endif 38 | 39 | .PHONY: delete_environment 40 | ## Delete the virtual (conda) environment for this project 41 | delete_environment: clean 42 | ifeq (conda,$(VIRTUALENV)) 43 | @$(PYTHON_INTERPRETER) -c "print('Deleting conda environment.')" 44 | $(CONDA_EXE) env remove --name $(PROJECT_NAME) 45 | $(RM) $(LOCKFILE) 46 | ifneq ("X$(wildcard .post-delete-environment.txt)","X") 47 | @$(CAT) .post-delete-environment.txt 48 | endif 49 | else 50 | $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) 51 | endif 52 | 53 | .PHONY: update_environment 54 | ## Install or update Python Dependencies in the virtual (conda) environment 55 | update_environment: environment_enabled $(LOCKFILE) 56 | ifneq ("X$(wildcard .post-update-environment.txt)","X") 57 | @$(CAT) .post-update-environment.txt 58 | endif 59 | 60 | .PHONY: environment_enabled 61 | # Checks that the conda environment is active 62 | environment_enabled: 63 | ifeq (conda,$(VIRTUALENV)) 64 | ifneq ($(notdir ${CONDA_DEFAULT_ENV}), $(PROJECT_NAME)) 65 | $(error Run "$(VIRTUALENV) activate $(PROJECT_NAME)" before proceeding...) 66 | endif 67 | else 68 | $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) 69 | endif 70 | 71 | .PHONY: check_lockfile 72 | # Test that an environment lockfile exists 73 | check_lockfile: 74 | ifeq (X,X$(wildcard $(LOCKFILE))) 75 | $(error Run "make update_environment" before proceeding...) 76 | endif 77 | 78 | .PHONY: check_environment 79 | ## Check if configuration is correct and environment is enabled 80 | check_environment: check_installation environment_enabled check_lockfile $(LOCKFILE) 81 | 82 | .PHONY: check_installation 83 | # Check if Easydata installation is correct, and all dependencies are installed 84 | check_installation: 85 | $(foreach testCommand,$(INSTALL_DEPS),\ 86 | $(if $(shell ${WHICH} $(testCommand) 2>${DEVNULL} ),,\ 87 | $(error "Dependency '$(testCommand)' not found.
Please re-install this dependency."))) 88 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/easydata/git-configuration.md: -------------------------------------------------------------------------------- 1 | # Setting up git and Checking Out the Repo 2 | 3 | **Note**: These instructions assume you are using SSH keys (and not HTTPS authentication) with {{ cookiecutter.upstream_location }}. If you haven't set up SSH access to your repo host, see [Configuring SSH Access to Github or Gitlab][git-ssh]. This also includes instructions for using more than one account with SSH keys. 4 | 5 | [git-ssh]: https://github.com/hackalog/cookiecutter-easydata/wiki/Configuring-SSH-Access-to-Github-or-GitLab 6 | 7 | ## Git Configuration 8 | When sharing a git repo with a small team, your code usually lives in at least 3 different places: 9 | 10 | * "local" refers to any git checkout on a local machine (or JupyterHub instance). This is where you work most of the time. 11 | * `upstream` refers to the shared Easydata repo on {{ cookiecutter.upstream_location}}; i.e. the **team repo**, 12 | * `origin` refers to your **personal fork** of the shared Easydata repo. It also lives on {{ cookiecutter.upstream_location}}. 13 | 14 | ### Create a Personal Fork 15 | 16 | We strongly recommend you make all your edits on a personal fork of this repo. Here's how to create such a fork: 17 | 18 | * On Github or Gitlab, press the Fork button in the top right corner. 19 | * On Bitbucket, press the "+" icon on the left and choose **Fork this Repo** 20 | 21 | ### Local, `origin`, and `upstream` 22 | git calls `upstream` (the **team repo**), and `origin` (your **personal fork** of the team repo) "remote" branches. Here's how to create them. 23 | 24 | Create a local git checkout by cloning your personal fork: 25 | ```bash 26 | git clone git@{{ cookiecutter.upstream_location }}:/{{cookiecutter.repo_name}}.git 27 | ``` 28 | Add the team (shared) repo as a remote branch named `upstream`: 29 | ```bash 30 | cd {{cookiecutter.repo_name}} 31 | git remote add upstream git@{{ cookiecutter.upstream_location }}:/{{cookiecutter.repo_name}}.git 32 | ``` 33 | 34 | You can verify that these branches are configured correctly by typing 35 | 36 | ``` 37 | >>> git remote -v 38 | origin git@{{ cookiecutter.upstream_location }}:/{{cookiecutter.repo_name}}.git (fetch) 39 | origin git@{{ cookiecutter.upstream_location }}:/{{cookiecutter.repo_name}}.git (push) 40 | upstream git@{{ cookiecutter.upstream_location }}:/{{cookiecutter.repo_name}}.git (fetch) 41 | upstream git@{{ cookiecutter.upstream_location }}:/{{cookiecutter.repo_name}}.git (push) 42 | ``` 43 | or if you use HTTPS-based authentication: 44 | ``` 45 | origin https://{{ cookiecutter.upstream_location }}//{{cookiecutter.repo_name}}.git (fetch) 46 | origin https://{{ cookiecutter.upstream_location }}//{{cookiecutter.repo_name}}.git (push) 47 | upstream https://{{ cookiecutter.upstream_location }}//{{cookiecutter.repo_name}}.git (fetch) 48 | upstream https://{{ cookiecutter.upstream_location }}//{{cookiecutter.repo_name}}.git (push) 49 | ``` 50 | 51 | ### Do Your Work in Branches 52 | To make life easiest, we recommend you do all your development **in branches**, and use your {{cookiecutter.default_branch}} branch **only** for tracking changes in the shared `upstream/{{cookiecutter.default_branch}}`. 
This combination makes it much easier not only to stay up to date with changes in the shared project repo, but also makes it easier to submit Pull/Merge Requests (PRs) against the upstream project repository should you want to share your code or data. 53 | 54 | ### A Useful Git Workflow 55 | Once you've got your local, `origin`, and `upstream` branches configured, you can follow the instructions in this handy [Git Workflow Cheat Sheet](git-workflow.md) to keep your working copy of the repo in sync with the others. 56 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py: -------------------------------------------------------------------------------- 1 | from .decorators import SingletonDecorator 2 | from .kvstore import KVStore 3 | from .log import logger 4 | from pathlib import Path 5 | 6 | class PathStore(KVStore): 7 | """Persistent Key-Value store for project-level paths 8 | 9 | >>> b = PathStore(config_file='/tmpx/project/catalog/config.ini', \ 10 | project_path='${catalog_path}/..', \ 11 | data_path='${project_path}/data', \ 12 | persistent=False) 13 | 14 | By default, the project directory is the parent of the directory containing the `config_file`: 15 | 16 | 17 | >>> b['project_path'] == Path('/tmpx/project').resolve() 18 | True 19 | >>> b['data_path'] == Path('/tmpx/project/data').resolve() 20 | True 21 | 22 | The `catalog_path` is set upon instantiation and is read-only: 23 | 24 | >>> b['catalog_path'] == Path('/tmpx/project/catalog').resolve() 25 | True 26 | >>> b['catalog_path'] = '/tmp' 27 | Traceback (most recent call last): 28 | ... 29 | AttributeError: catalog_path is write-protected 30 | 31 | Changing a value changes all values that expand to contain it: 32 | 33 | >>> b['project_path'] = '/tmpy' 34 | >>> b['project_path'] == Path('/tmpy').resolve() 35 | True 36 | >>> b['data_path'] == Path('/tmpy/data').resolve() 37 | True 38 | 39 | We can have multiple levels of expansion: 40 | 41 | >>> b['raw_data_path'] = "${data_path}/raw" 42 | >>> b['raw_data_path'] == Path('/tmpy/data/raw').resolve() 43 | True 44 | >>> b['project_path'] = '/tmp3' 45 | >>> b['data_path'] == Path('/tmp3/data').resolve() 46 | True 47 | >>> b['raw_data_path'] == Path('/tmp3/data/raw').resolve() 48 | True 49 | """ 50 | 51 | # These keys should never be written to disk, though they may be used 52 | # as variables in relative paths 53 | _protected = ['catalog_path'] 54 | 55 | def __init__(self, *args, 56 | config_section='Paths', config_file=None, 57 | **kwargs): 58 | """Handle the special case of the config file""" 59 | if config_file is None: 60 | self._config_file = "config.ini" 61 | else: 62 | self._config_file = Path(config_file) 63 | self._usage_warning = False 64 | super().__init__(*args, config_section=config_section, 65 | config_file=self._config_file, **kwargs) 66 | self._usage_warning = True 67 | 68 | def _write(self): 69 | """temporarily hide protected keys when saving""" 70 | for key in self._protected: 71 | self._config.remove_option(self._config_section, key) 72 | super()._write() 73 | for key in self._protected: 74 | self._config.set(self._config_section, key, str(getattr(self, key))) 75 | 76 | def __setitem__(self, key, value): 77 | """Do not set a key if it is protected""" 78 | if key in self._protected: 79 | raise AttributeError(f"{key} is write-protected") 80 | 81 | if self._usage_warning: 82 | logger.warning(f"'{key}' is a local configuration variable, and for reproducibility reasons, 
should not be set from a notebook or shared code. It is better to edit '{self._config_file}' instead. We have set it, but you have been warned.") 83 | 84 | super().__setitem__(key, value) 85 | 86 | 87 | def __getitem__(self, key): 88 | """get keys (including protected ones), converting to paths and fully resolving them""" 89 | if key in self._protected: 90 | return getattr(self, key) 91 | self._read() 92 | return Path(super().__getitem__(key)).resolve() 93 | 94 | @property 95 | def catalog_path(self): 96 | return self._config_file.parent.resolve() 97 | 98 | @SingletonDecorator 99 | class Paths(PathStore): 100 | pass 101 | 102 | 103 | if __name__ == "__main__": 104 | import doctest 105 | doctest.testmod() 106 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/easydata/troubleshooting.md: -------------------------------------------------------------------------------- 1 | ## Troubleshooting Guide 2 | 3 | It's impossible to test the configurations on every possible machine, so we haven't caught everything. But we're working on making fixes as problems come up. Here's what we've encountered so far (with links to the issues in question if you want to deep dive into the fix). 4 | 5 | Before you report a problem, make sure you are running the latest version of the upstream (team) repo. 6 | Assuming you are following the [recommended git workflow](git-workflow.md) (i.e. you have set your `upstream` remote to point to the shared team repo, you are working in a branch, and your `{{ cookiecutter.default_branch }}` branch is tracking the upstream repo), this means doing the following: 7 | ``` 8 | git checkout {{ cookiecutter.default_branch }} 9 | git fetch upstream --prune 10 | git merge upstream/{{ cookiecutter.default_branch }} 11 | git push origin {{ cookiecutter.default_branch }} 12 | make update_environment 13 | ``` 14 | 15 | You can then update your working branches as follows: 16 | ``` 17 | git checkout my_branch 18 | git merge {{ cookiecutter.default_branch }} # advanced git users can do a rebase here. Others please merge. 19 | ``` 20 | 21 | Next, turn on debugging in your notebook. Add these cells to the top: 22 | ``` 23 | import logging 24 | from {{ cookiecutter.module_name }}.log import logger 25 | 26 | logger.setLevel(logging.DEBUG) 27 | ``` 28 | 29 | Third, ensure your notebook is running the correct environment; i.e. select **Kernel -> Change kernel -> Python[conda env:{{ cookiecutter.repo_name }}]**. If you don't seem to have that option, make sure that you ran `jupyter notebook` with the `{{ cookiecutter.repo_name }}` conda environment enabled, and that `which jupyter` points to the correct ({{ cookiecutter.repo_name }}) version of jupyter. 30 | 31 | 32 | If your problem persists, work through the table below. If these fail to resolve your issue, please post an issue. Include the following with your issue: 33 | 34 | * A copy/pasted version of the error traceback text (preferably posted as a "code snippet"), including DEBUG-level log messages. 35 | * The contents of your `environment.*.lock.yml` 36 | * The output of `%conda info` (run from within your jupyter notebook) 37 | * The output of `which python` and `which jupyter` 38 | 39 | | Problem | Status | Fix | 40 | | :--- | :---- | :---- | 41 | | General weirdness due to not being in the right conda environment | **Try this first** | `conda activate {{ cookiecutter.repo_name }}` or change the kernel in your jupyter notebook | 42 | | Old conda (e.g.
`{{ cookiecutter.module_name }}` module is not being installed correctly) | **Try this second**| Upgrade conda to version > 4.8 | 43 | | `{{ cookiecutter.module_name }}` module not found | **Try this first** | `conda activate {{ cookiecutter.repo_name }}`| 44 | | `{{ cookiecutter.module_name }}` module still doesn't work | **Try this second** | `touch environment.yml && make update_environment` | 45 | | Nothing works | Take off and nuke it from orbit | `conda deactivate && make delete_environment && make create_environment`| 46 | 47 | ### Other specific troubleshooting FAQ 48 | 49 | If `import cairo` fails, this may suggest some library (such as `libXrender.so`) could be missing. If you’ve followed all the troubleshooting instructions above, then proceed. 50 | 51 | There is an open issue with Conda's handling of system dependencies related to the Cairo library, which is used for graph visualization through the `igraph` library, amongst other things. Seemingly, on cloud-borne virtual machines, such libraries that are common on desktop installs go undeployed, a fact that Conda apparently neglects. 52 | 53 | Once can work around this issue by locally installing the missing dependency through their system's package manager (e.g. APT, Yum, Homebrew, and so on). For instance, on Ubuntu 18.04, the aforementioned Xrender library can be installed with the command 54 | 55 | ``` 56 | sudo apt-get install -y libxrender-dev 57 | ``` 58 | 59 | 60 | ### Quick References 61 | 62 | * [README](../README.md) 63 | * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) 64 | * [Getting and Using Datasets](datasets.md) 65 | * [Using Notebooks for Analysis](notebooks.md) 66 | * [Sharing your Work](sharing-your-work.md) 67 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/utils/ipynbname.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib.error 3 | import urllib.request 4 | from itertools import chain 5 | from pathlib import Path, PurePath 6 | from typing import Generator, Tuple, Union 7 | 8 | import ipykernel 9 | from jupyter_core.paths import jupyter_runtime_dir 10 | from traitlets.config import MultipleInstanceError 11 | 12 | __license__ = """ 13 | Copyright (c) 2020 Mark McPherson 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy 16 | of this software and associated documentation files (the "Software"), to deal 17 | in the Software without restriction, including without limitation the rights 18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | copies of the Software, and to permit persons to whom the Software is 20 | furnished to do so, subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included in all 23 | copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | SOFTWARE. 
32 | """ 33 | 34 | 35 | 36 | FILE_ERROR = "Can't identify the notebook {}." 37 | CONN_ERROR = "Unable to access server;\n" \ 38 | + "ipynbname requires either no security or token based security." 39 | 40 | 41 | def _list_maybe_running_servers(runtime_dir=None) -> Generator[dict, None, None]: 42 | """ Iterate over the server info files of running notebook servers. 43 | """ 44 | if runtime_dir is None: 45 | runtime_dir = jupyter_runtime_dir() 46 | runtime_dir = Path(runtime_dir) 47 | 48 | if runtime_dir.is_dir(): 49 | for file_name in chain( 50 | runtime_dir.glob('nbserver-*.json'), # jupyter notebook (or lab 2) 51 | runtime_dir.glob('jpserver-*.json'), # jupyterlab 3 52 | ): 53 | yield json.loads(file_name.read_bytes()) 54 | 55 | 56 | def _get_kernel_id() -> str: 57 | """ Returns the kernel ID of the ipykernel. 58 | """ 59 | connection_file = Path(ipykernel.get_connection_file()).stem 60 | kernel_id = connection_file.split('-', 1)[1] 61 | return kernel_id 62 | 63 | 64 | def _get_sessions(srv): 65 | """ Given a server, returns sessions, or HTTPError if access is denied. 66 | NOTE: Works only when either there is no security or there is token 67 | based security. An HTTPError is raised if unable to connect to a 68 | server. 69 | """ 70 | try: 71 | qry_str = "" 72 | token = srv['token'] 73 | if token: 74 | qry_str = f"?token={token}" 75 | url = f"{srv['url']}api/sessions{qry_str}" 76 | with urllib.request.urlopen(url) as req: 77 | return json.load(req) 78 | except Exception: 79 | raise urllib.error.HTTPError(CONN_ERROR) 80 | 81 | 82 | def _find_nb_path() -> Union[Tuple[dict, PurePath], Tuple[None, None]]: 83 | try: 84 | kernel_id = _get_kernel_id() 85 | except (MultipleInstanceError, RuntimeError, IndexError): 86 | return None, None # Could not determine 87 | for srv in _list_maybe_running_servers(): 88 | try: 89 | sessions = _get_sessions(srv) 90 | for sess in sessions: 91 | if sess['kernel']['id'] == kernel_id: 92 | return srv, PurePath(sess['notebook']['path']) 93 | except Exception: 94 | pass # There may be stale entries in the runtime directory 95 | return None, None 96 | 97 | def filepath(): 98 | """Return notebook filename and path as a tuple""" 99 | _, path = _find_nb_path() 100 | if path: 101 | return path.name, path.parent 102 | raise FileNotFoundError(FILE_ERROR.format('name')) 103 | 104 | 105 | def name() -> str: 106 | """ Returns the short name of the notebook w/o the .ipynb extension, 107 | or raises a FileNotFoundError exception if it cannot be determined. 108 | """ 109 | _, path = _find_nb_path() 110 | if path: 111 | return path.stem 112 | raise FileNotFoundError(FILE_ERROR.format('name')) 113 | 114 | def path() -> Path: 115 | """ Returns the absolute path of the notebook, 116 | or raises a FileNotFoundError exception if it cannot be determined. 117 | """ 118 | srv, path = _find_nb_path() 119 | if srv and path: 120 | root_dir = Path(srv.get('root_dir') or srv['notebook_dir']) 121 | return root_dir / path 122 | raise FileNotFoundError(FILE_ERROR.format('path')) 123 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/easydata/paths.md: -------------------------------------------------------------------------------- 1 | ## Specifying paths in Easydata 2 | 3 | As hardcoded paths are a notorious source of reproducibility issues, Easydata attempts to help avoid path-related issues by introducing a mechanism called `paths`. 
4 | 5 | ``` 6 | >>> from {{ cookiecutter.module_name }} import paths 7 | ``` 8 | 9 | The goal of the `paths` mechanism is to help ensure that **hardcoded path data is never checked-in** to the git repository. 10 | 11 | In an Easydata project, paths are recorded in `catalog/config.ini`. This is a standard `configparser`-format _ini_ file (in [ExtendedInterpolation] format). The paths specified in this file are used throughout Easydata to specify the standard locations of data artifacts. 12 | 13 | [ExtendedInterpolation]: https://docs.python.org/3/library/configparser.html#configparser.ExtendedInterpolation 14 | 15 | Because [ExtendedInterpolation] format is used, paths may refer to each other without the need to specify absolute path names. The default paths, for example, are all relative to `project_path`: 16 | 17 | ``` 18 | [Paths] 19 | data_path = ${project_path}/data 20 | raw_data_path = ${data_path}/raw 21 | interim_data_path = ${data_path}/interim 22 | processed_data_path = ${data_path}/processed 23 | project_path = ${catalog_path}/.. 24 | ``` 25 | 26 | Note that, for chicken-and-egg reasons, `catalog_path` (the location of the `config.ini` file used to specify the paths) is **not specified** in this file. It is set upon module instantiation (when `{{ cookiecutter.module_name }}` is imported) and is write-protected: 27 | 28 | ``` 29 | >>> paths['catalog_path'] 30 | PosixPath('/tmpx/project/catalog') 31 | >>> paths['catalog_path'] = '/tmp' 32 | Traceback (most recent call last): 33 | ... 34 | AttributeError: catalog_path is write-protected 35 | ``` 36 | 37 | ### Accessing `paths` from Python 38 | 39 | Within Python, `paths` appears to be a dictionary of standard path locations. For instance, if your Easydata project lives in the `/path/to/repo` directory: 40 | 41 | ```python 42 | >>> paths['project_path'] 43 | /path/to/repo 44 | >>> type(paths['project_path']) 45 | pathlib.PosixPath 46 | ``` 47 | 48 | Notice that paths are automatically resolved to absolute filenames (in [pathlib] format) when accessed. 49 | 50 | ```python 51 | >>> for name, location in paths.items(): 52 | >>> print(f"{name}: {location}") 53 | data_path: /path/to/repo/data 54 | raw_data_path: /path/to/repo/data/raw 55 | interim_data_path: /path/to/repo/data/interim 56 | processed_data_path: /path/to/repo/data/processed 57 | project_path: /path/to/repo 58 | ``` 59 | [pathlib]: https://docs.python.org/3/library/pathlib.html 60 | 61 | Even though absolute paths are returned from the dictionary, the relative nature of the paths is preserved when these paths are modified. 62 | 63 | 64 | ### Modifying paths 65 | 66 | Recall that one of the Easydata design goals is to ensure that hardcoded paths should not be checked into your git repository. To this end, paths should **never be set from within notebooks or source code that is checked-in** to git.
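For context, assigning to `paths` from a notebook does work, but as the implementation in `{{ cookiecutter.module_name }}/_paths.py` shows, it logs a reproducibility warning and rewrites `catalog/config.ini` behind the scenes. A minimal, hypothetical example of what to avoid in checked-in code:

```python
>>> from {{ cookiecutter.module_name }} import paths
>>> paths['raw_data_path'] = '${data_path}/raw-alternate'  # works, but logs a warning and edits config.ini
```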
If you wish to modify a path on your local system, edit `config.ini` directly, or use Python from the command line, as shown below: 67 | 68 | ```bash 69 | >>> python -c "import {{ cookiecutter.module_name }}; {{ cookiecutter.module_name }}.paths['project_path'] = '/alternate/bigdata/path'" 70 | ``` 71 | 72 | When accessed from Python, you'll immediately see the paths have all changed: 73 | 74 | ```python 75 | >>> for name, location in paths.items(): 76 | >>> print(f"{name}: {location}") 77 | data_path: /alternate/bigdata/path/data 78 | raw_data_path: /alternate/bigdata/path/data/raw 79 | interim_data_path: /alternate/bigdata/path/data/interim 80 | processed_data_path: /alternate/bigdata/path/data/processed 81 | project_path: /alternate/bigdata/path 82 | ``` 83 | as has `config.ini`: 84 | 85 | ```bash 86 | >>> cat catalog/config.ini 87 | [Paths] 88 | data_path = ${project_path}/data 89 | raw_data_path = ${data_path}/raw 90 | interim_data_path = ${data_path}/interim 91 | processed_data_path = ${data_path}/processed 92 | project_path = /alternate/bigdata/path 93 | ``` 94 | 95 | ### Accessing the unresolved paths from Python 96 | 97 | If you ever need to see the raw (non-resolved) versions of the paths from within Python, use `paths.data`: 98 | 99 | ```python 100 | >>> for name, location in paths.data.items(): 101 | >>> print(f"{name}: {location}") 102 | data_path:${project_path}/data 103 | raw_data_path:${data_path}/raw 104 | interim_data_path:${data_path}/interim 105 | processed_data_path:${data_path}/processed 106 | project_path:/alternate/bigdata/path 107 | ``` 108 | 109 | ### For more information 110 | ```python 111 | >>> from {{ cookiecutter.module_name }} import paths 112 | >>> help(paths) 113 | ``` 114 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pathlib 4 | import time 5 | 6 | import nbformat 7 | from nbconvert.preprocessors import ExecutePreprocessor, CellExecutionError 8 | 9 | 10 | from ..log import logger 11 | from .ipynbname import name as ipynb_name, path as ipynb_path 12 | from ..
import paths 13 | 14 | # Timing and Performance 15 | 16 | def timing_info(method): 17 | def wrapper(*args, **kw): 18 | start_time = time.time() 19 | result = method(*args, **kw) 20 | end_time = time.time() 21 | logger.info(f"timing_info: {method.__name__}" 22 | f"@{round((end_time-start_time)*1000,1)} ms") 23 | 24 | return result 25 | 26 | return wrapper 27 | 28 | def record_time_interval(section, start_time, line_break=False): 29 | """Record a time interval since the last timestamp""" 30 | end_time = time.time() 31 | delta = end_time - start_time 32 | if delta < 1: 33 | delta *= 1000 34 | units = "ms" 35 | else: 36 | units = "s" 37 | if line_break: 38 | logger.debug("PROCESS_TIME:{:>36} {} {}\n".format(section, round(delta, 1), units)) 39 | else: 40 | logger.debug("PROCESS_TIME:{:>36} {} {}".format(section, round(delta, 1), units)) 41 | return end_time 42 | 43 | def normalize_numpy_dict(d): 44 | ret = d.copy() 45 | for k, v in ret.items(): 46 | if isinstance(v, np.generic): 47 | ret[k] = np.asscalar(v) 48 | return ret 49 | 50 | def save_json(filename, obj, indent=2, sort_keys=True): 51 | """Dump an object to disk in json format 52 | 53 | filename: pathname 54 | Filename to dump to 55 | obj: object 56 | Object to dump 57 | indent: integer 58 | number of characters to indent 59 | sort_keys: boolean 60 | Whether to sort keys before writing. Should be True if you ever use revision control 61 | on the resulting json file. 62 | """ 63 | blob = json.dumps(obj, indent=indent, sort_keys=sort_keys) 64 | 65 | with open(filename, 'w') as fw: 66 | fw.write(blob) 67 | 68 | def load_json(filename): 69 | """Read a json file from disk""" 70 | with open(filename) as f: 71 | obj = json.load(f) 72 | return obj 73 | 74 | def head_file(filename, n=5): 75 | """Return the first `n` lines of a file 76 | """ 77 | with open(filename, 'r') as fd: 78 | lines = [] 79 | for i, line in enumerate(fd): 80 | if i > n: 81 | break 82 | lines.append(line) 83 | return "".join(lines) 84 | 85 | def list_dir(path, fully_qualified=False, glob_pattern='*'): 86 | """do an ls on a path 87 | 88 | fully_qualified: boolean (default: False) 89 | If True, return a list of fully qualified pathlib objects. 90 | if False, return just the bare filenames 91 | glob_pattern: glob (default: '*') 92 | File mattern to match 93 | 94 | Returns 95 | ------- 96 | A list of names, or fully qualified pathlib objects""" 97 | if fully_qualified: 98 | return list(pathlib.Path(path).glob(glob_pattern)) 99 | 100 | return [file.name for file in pathlib.Path(path).glob(glob_pattern)] 101 | 102 | def normalize_to_list(str_or_iterable): 103 | """Convert strings to lists. convert None to list. 
Convert all other iterables to lists 104 | """ 105 | if isinstance(str_or_iterable, str): 106 | return [str_or_iterable] 107 | if str_or_iterable is None: 108 | return [] 109 | return str_or_iterable 110 | 111 | 112 | def run_notebook(*, 113 | notebook_name=None, 114 | notebook_path=None, 115 | output_notebook_name=None, 116 | output_notebook_path=None, 117 | timeout=-1, 118 | notebook_version=4, 119 | kernel='python3', 120 | ): 121 | """Execute a jupyter notebook 122 | 123 | kernel name is an issue: https://github.com/jupyter/nbconvert/issues/515 124 | 125 | """ 126 | if notebook_path is None: 127 | notebook_path = paths['notebook_path'] 128 | else: 129 | notebook_path = pathlib.Path(notebook_path) 130 | 131 | if output_notebook_path is None: 132 | output_notebook_path = paths['interim_data_path'] 133 | else: 134 | output_notebook_path = pathlib.Path(output_notebook_path) 135 | 136 | if output_notebook_name is None: 137 | output_notebook_name = notebook_name 138 | 139 | output_notebook_fq = output_notebook_path / output_notebook_name 140 | 141 | with open(notebook_path / notebook_name) as f: 142 | nb = nbformat.read(f, as_version=notebook_version) 143 | 144 | ep = ExecutePreprocessor(timeout=timeout, kernel_name=kernel) 145 | try: 146 | out = ep.preprocess(nb, {'metadata': {'path': notebook_path}}) 147 | except CellExecutionError: 148 | out = None 149 | msg = f"""Error executing the notebook "{notebook_name}". 150 | 151 | See notebook "{str(output_notebook_fq)}" for the traceback.' 152 | """ 153 | logger.error(msg) 154 | raise 155 | finally: 156 | with open(output_notebook_fq, mode='w', encoding='utf-8') as f: 157 | nbformat.write(nb, f) 158 | return output_notebook_name 159 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/kvstore.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import pathlib 3 | from collections.abc import MutableMapping 4 | 5 | class KVStore(MutableMapping): 6 | """Dictionary-like key-value store backed to disk by a ConfigParser (ini) file 7 | 8 | Basic functionality is that of a dictionary, with the addition of an implicit 9 | `config_file` and `config_section`: 10 | 11 | >>> getfixture('manage_config_ini') # This is just a test fixture, please disregard 12 | >>> d = KVStore({'key1':'value1'}, key2='value2') 13 | >>> d['key3'] = 'value3' 14 | >>> d 15 | KVStore(config_file='config.ini', config_section='KVStore', key1='value1', key2='value2', key3='value3') 16 | 17 | 18 | To create a brand new, default KVStore, ignoring anything that may already be on disk: 19 | >>> d = KVStore(overwrite=True) 20 | >>> d 21 | KVStore(config_file='config.ini', config_section='KVStore', ) 22 | 23 | KVStore values can reference other values via substitution using the 24 | `ConfigParser.ExtendedInterpolation` format. When the KVStore is viewed as a dict, 25 | this substitution happens automatically. 26 | 27 | >>> d = KVStore(root_path='/tmp', data_path='${root_path}/data') 28 | >>> dict(d) 29 | {'root_path': '/tmp', 'data_path': '/tmp/data'} 30 | >>> d['data_path'] 31 | '/tmp/data' 32 | 33 | To see the unparsed (raw) value, examine the object's `data` method; e.g. 34 | >>> d.data 35 | {'root_path': '/tmp', 'data_path': '${root_path}/data'} 36 | 37 | This substitution is updated whenever a key changes; e.g. 
38 | >>> d['raw_data_path'] = '${root_path}/raw' 39 | >>> d['root_path'] = '/tmp2' 40 | >>> dict(d) 41 | {'root_path': '/tmp2', 'data_path': '/tmp2/data', 'raw_data_path': '/tmp2/raw'} 42 | >>> d.data 43 | {'root_path': '/tmp2', 'data_path': '${root_path}/data', 'raw_data_path': '${root_path}/raw'} 44 | >>> d['data_path'] 45 | '/tmp2/data' 46 | 47 | Because this object is disk-backed, newly instantiated objects will receive the last set of defaults: 48 | >>> c = KVStore() 49 | >>> dict(c) 50 | {'root_path': '/tmp2', 'data_path': '/tmp2/data', 'raw_data_path': '/tmp2/raw'} 51 | >>> c.data 52 | {'root_path': '/tmp2', 'data_path': '${root_path}/data', 'raw_data_path': '${root_path}/raw'} 53 | 54 | We can force overwriting of this disk-backed file using the `overwrite` parameters: 55 | >>> c = KVStore(overwrite=True) 56 | >>> dict(c), c.data 57 | ({}, {}) 58 | """ 59 | def __init__(self, *args, 60 | config_file=None, config_section="KVStore", overwrite=False, persistent=True, 61 | **kwargs): 62 | """Create a new disk-backed key-value store 63 | 64 | Arguments 65 | --------- 66 | config_file: Path 67 | path to ini (ConfigParser-formatted) file that will be used to persist the KVStore 68 | config_section: String 69 | Section name to be used in the `config_file` 70 | overwrite: Boolean 71 | If True, any config file on disk will be overwritten. 72 | Otherwise, existing values from this file will be used as defaults, 73 | (unless overridden by explicit key/value pairs in the constructor) 74 | *args, **kwargs: 75 | All other arguments will be used as per the standard `dict` constructor 76 | 77 | """ 78 | self._persistent = persistent 79 | if config_file is None: 80 | self._config_file = pathlib.Path("config.ini") 81 | else: 82 | self._config_file = pathlib.Path(config_file) 83 | self._config_section = config_section 84 | self._config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation()) 85 | 86 | self.data = dict() 87 | 88 | if self._config_file.exists() and not overwrite: 89 | self._read() 90 | else: 91 | self._config.add_section(config_section) 92 | self._config.read_dict(self.data) 93 | 94 | self.update({k:v for k,v in self._config.items(self._config_section, raw=True)}) # `update` comes for free from the abc 95 | self.update(dict(*args, **kwargs)) 96 | self._write() 97 | 98 | def __getitem__(self, key): 99 | return self._config.get(self._config_section, key) 100 | 101 | def __setitem__(self, key, value): 102 | self.data[key] = value 103 | self._config.set(self._config_section, key, value) 104 | self._write() 105 | 106 | def __delitem__(self, key): 107 | del self.data[key] 108 | self._config.remove_option(self._config_section, key) 109 | self._write() 110 | 111 | def __iter__(self): 112 | return iter(self.data) 113 | 114 | def __len__(self): 115 | return len(self.data) 116 | 117 | def _read(self): 118 | self._config.read(self._config_file) 119 | if not self._config.has_section(self._config_section): 120 | # File exists but we are adding to a new section of it 121 | self._config.add_section(self._config_section) 122 | 123 | def _write(self): 124 | if self._persistent: 125 | with open(self._config_file, 'w') as fw: 126 | self._config.write(fw) 127 | 128 | def __repr__(self): 129 | kvstr = ", ".join([f"{k}='{v}'" for k,v in self.data.items()]) 130 | return f"KVStore(config_file='{str(self._config_file)}', config_section='{self._config_section}', {kvstr})" 131 | 132 | def __str__(self): 133 | return str({k:v for k,v in self._config.items(self._config_section, raw=False)}) 
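# Illustrative usage sketch (comments only, not executed on import; the
# temporary-file location below is an assumption for the example). It shows the
# disk-backed round trip described in the class docstring: a second KVStore
# pointed at the same ini file sees the values the first one wrote.
#
#     import tempfile, pathlib
#     tmp_ini = pathlib.Path(tempfile.mkdtemp()) / "example.ini"
#     first = KVStore(config_file=tmp_ini, root_path='/tmp', data_path='${root_path}/data')
#     second = KVStore(config_file=tmp_ini)   # re-reads the same ini file
#     assert second['data_path'] == '/tmp/data'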
134 | 135 | 136 | if __name__ == "__main__": 137 | import doctest 138 | doctest.testmod() 139 | -------------------------------------------------------------------------------- /docs/00-xyz-sample-notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Naming Convention\n", 8 | "The notebooks are named `dd-xyz-title.ipynb` where:\n", 9 | "* `dd` is an integer indicating the notebook sequence. This is critical when there are dependencies between notebooks\n", 10 | "* `xyz` is the author's initials, to help avoid namespace clashes when multiple parties are committing to the same repo\n", 11 | "* `title` is the name of the notebook, words separated by hyphens.\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Useful Header Cells\n", 19 | "Make jupyter notebook use the full screen width" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from IPython.core.display import display, HTML\n", 29 | "display(HTML(\"\"))" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "When developing code in the `src` module, it's very useful to enable auto-reload:" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "%load_ext autoreload\n", 46 | "%autoreload 2" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Python Libraries\n", 54 | "Imports you'll almost always want" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Python Imports, alphabetized\n", 64 | "import pathlib\n", 65 | "\n", 66 | "#3rd party python modules, alphabetized\n", 67 | "\n", 68 | "import pandas as pd\n", 69 | "\n", 70 | "# Source module imports \n", 71 | "from src import paths\n", 72 | "from src.data import DataSource, Dataset, Catalog" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Logging\n", 80 | "Enable logging and crank up log level to DEBUG. This is particularly useful when developing code in your project module and using it from a notebook." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "import logging\n", 90 | "from src.log import logger\n", 91 | "\n", 92 | "logger.setLevel(logging.DEBUG)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Working with a Dataset from the catalog\n", 100 | "List available datasets\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "c = Catalog.load('datasets'); c" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Note: The first time running a `load` function on a new dataset may be slow, as it is doing all the work to generate and verify the contents of a dataset. However, on subsequent runs, it will use a cached copy of the dataset and be quick. 
" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "%%time\n", 126 | "ds = Dataset.load('20_newsgroups') # replace my-dataset with the name of a dataset you have a recipe for" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "len(ds.data)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "ds.data[:5]" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "print(ds.README)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "print(ds.LICENSE)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "If you have data, you're up and running with a working installation." 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## Some data science libraries built in to the base conda environment" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "# basic data science and visualization libraries\n", 186 | "import sklearn\n", 187 | "import matplotlib\n", 188 | "import scipy\n", 189 | "import pandas" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python [conda env:easydata-notebook]", 203 | "language": "python", 204 | "name": "conda-env-easydata-notebook-py" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.7.10" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 4 221 | } 222 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. 
latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\{{ cookiecutter.repo_name }}.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\{{ cookiecutter.repo_name }}.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. 
The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/datasets.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | Easydata lets you build a dataset catalog from which you can load any dataset in the catalog via its `dataset_name` via the `.load` API. 4 | ```python 5 | ds = Dataset.load(dataset_name) 6 | ``` 7 | The basic idea is that we don't want to share data directly, instead, we share the recipes for how to re-create Datasets. These recipes are stored in the dataset catalog. Datasets can then be shared by sharing the catalog. 8 | 9 | Datasets are the fundamental object that makes sharing of datasets reproducible, as they keep track of their own recipes, check that the data created from a recipe has the correct hashes, and keep licenses and other metadata with the data itself. 10 | 11 | ## What is a `Dataset` object? 12 | 13 | A Dataset is the fundamental object we use for turning raw data into useful datasets, reproducibly. It is like a scikit-learn-style `Bunch` object---essentially, a dictionary with some extra magic to make it nicer to work with---containing the following attributes: 14 | 15 | ```bash 16 | data: the processed data 17 | target: (optional) target vector (for supervised learning problems) 18 | metadata: Data about the data 19 | ``` 20 | 21 | The `data` attribute can really be any processed data form that you like: sometimes it's a pandas dataframe, a list of tuples containing other data, or other formats including `scipy.sparse` matrices or `igraph` graphs. The `target` (if you're using it), expects something that matches the `data` in terms of length. 22 | 23 | For a hint as to which data format to expect, you can look at the contents of the `DESCR` attribute, one of the many pieces of medata that are maintained as part of the `Dataset` object. 24 | 25 | This `metadata` is where things get interesting... which we'll cover on its own next. 26 | 27 | ## Why `metadata`? 28 | The `metadata` is where the magic lives. 
It serves several purposes in terms of bookkeeping: 29 | 30 | * it includes `HASHES`, which **improve data reproducibility**, since what you download and process gets checked each step along the way to ensure the raw data matches what is stored in the `dataset_catalog`, 31 | * it provides easy access to **what the data is** via the `DESCR` attribute, 32 | * it provides easy (and continual) **access to the license / usage restrictions** for the data (the `LICENSE` attribute), which helps with knowing what you can do when [Sharing your Work](sharing-your-work.md). 33 | * it provides the **extra data manifest**, `EXTRA`, if your dataset includes additional raw data (extra) files. 34 | 35 | In short, it helps you to know what data you're working with, what you can do with it, and whether something has gone wrong. 36 | 37 | Under the hood, metadata is a dictionary; however, metadata can also be accessed by referring to attributes expressed in uppercase. For example, `ds.metadata['license']` and `ds.LICENSE` refer to the same thing. 38 | 39 | ## Using a `Dataset` 40 | As mentioned before, to load a `Dataset`: 41 | ```python 42 | ds = Dataset.load("") 43 | ``` 44 | At this point, if you already have a cached copy of the desired `Dataset` on disk, it will load it. Otherwise, it will follow the *recipe* for generating the requested `Dataset`; i.e. generate the dataset from raw data, as per the instructions contained in the `dataset_catalog` (described below). 45 | 46 | Because of licenses and other distribution restrictions, some of the datasets may require a **manual download** step. If so, you will be prompted at this point and given instructions for what to do. Some datasets will require local pre-processing. If so, the first time you run the command, you will be executing all of the processing scripts (which can be quite slow). 47 | 48 | After the first load, however, datasets will load from cache on disk, which should be fast. If you need to free up space, you can even delete related source files from `data/raw` and `data/interim`. Just don't touch the `data/processed` directory. 49 | 50 | To access the data, target, or metadata: 51 | ```python 52 | ds.data 53 | ds.target 54 | ds.metadata 55 | ``` 56 | 57 | To access the most common metadata fields: 58 | ```python 59 | ds.DESCR # or ds.metadata['descr'] 60 | ds.LICENSE # or ds.metadata['license'] 61 | ds.HASHES # or ds.metadata['hashes'] 62 | ``` 63 | ## The Dataset catalog 64 | You can explore all of the currently available `Datasets` via the dataset catalog. The catalog keeps a record of the recipes used to generate a `Dataset`, along with relevant hashes that are used to ensure the integrity of data when it's loaded. 65 | 66 | To access the catalog: 67 | 68 | ```python 69 | workflow.available_datasets(keys_only=True) 70 | ``` 71 | If you're interested, set `keys_only=False` to see the complete contents of the metadata that is saved in the catalog. 72 | 73 | ## Creating Dataset Recipes 74 | 75 | For the curious... 76 | 77 | The API for adding datasets is not yet user friendly, but we are currently working on making it so.
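That said, the [New-Edge-Template](../New-Edge-Template) example included with these docs gives a flavour of what a recipe looks like under the hood. Here is a condensed sketch based on that example (the dataset and file names are placeholders, and `src` is the project module name used throughout these docs):

```python
from functools import partial

from src import paths
from src.data import Dataset, DatasetGraph, serialize_transformer_pipeline
from src.data.transformer_functions import csv_to_pandas

# Register a recipe: derive 'my-dataset' from 'my-dataset-raw' by parsing a csv
dag = DatasetGraph(catalog_path=paths['catalog_path'])
transformers = [partial(csv_to_pandas, output_map={'my-dataset': 'my-file.csv'})]
dag.add_edge(input_dataset='my-dataset-raw',
             output_dataset='my-dataset',
             transformer_pipeline=serialize_transformer_pipeline(transformers),
             overwrite_catalog=True)

# Anyone with this catalog entry (and the raw dataset) can now reproduce the result
ds = Dataset.load('my-dataset')
```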
78 | 79 | When to create a dataset: 80 | 81 | * If you're even tempted to save some data to disk so you don't have to recompute it later, 82 | * If you're even tempted to save data to share with someone else, 83 | * If you want to access your data from another notebook/code path, 84 | * If you feel like it :) 85 | 86 | We have included some examples to let you look under the hood if you're interested, and have included some common examples as part of the `workflow` module to make it easier to use. 87 | 88 | Example notebooks using the built-in `workflow`: 89 | 90 | * [Creating a dataset from a csv file](../Add-csv-template) 91 | * [Creating a derived dataset using a single function](../Add-derived-dataset) 92 | 93 | Example notebooks for generally building datasets: 94 | 95 | * [Dataset from raw file](../New-Dataset-Template) 96 | * [Dataset from another dataset](../New-Edge-Template) 97 | 98 | You can also make datasets from multiple existing datasets, or make multiple datasets at once. 99 | 100 | Some datasets are trickier to include than others and may used advanced functionality. So please ask any questions that you may have. We'll attempt to explain and update the examples based on requests. -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/easydata/datasets.md: -------------------------------------------------------------------------------- 1 | # Getting and Using Datasets 2 | 3 | ## TL;DR 4 | To get started, all you really need to know is that you can query for available datasets via 5 | ```python 6 | from {{ cookiecutter.module_name }}.data import Catalog 7 | Catalog.load("datasets") 8 | ``` 9 | 10 | and load these datasets via 11 | ```python 12 | from {{ cookiecutter.module_name }}.data import Dataset 13 | ds = Dataset.load(dataset_name) 14 | ``` 15 | 16 | If you've followed the instructions from building the repo contained in the [README](../README.md), this should just work (if it doesn't, please let us know)! 17 | 18 | You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.README` and `ds.LICENSE`. 19 | 20 | **Disk Space Note**: sometimes datasets can be quite large. If you want to store your data externally, we recommend pointing your data directory to a new location; that is, 21 | 22 | ```python 23 | from {{ cookiecutter.module_name }} import paths 24 | paths["data_path"] = "/path/to/big/data/directory" 25 | ``` 26 | 27 | ## Digging Deeper 28 | It is useful to know a little bit more about how Datasets work. 29 | 30 | ## What is a `Dataset` object? 31 | 32 | A Dataset is the fundamental object we use for turning raw data into useful datasets, reproducibly. It is like a scikit-learn-style `Bunch` object --- essentially, a dictionary with some extra magic to make it nicer to work with --- containing the following attributes: 33 | 34 | ``` 35 | data: the processed data 36 | target: (optional) target vector (for supervised learning problems) 37 | metadata: Data about the data 38 | ``` 39 | 40 | The `data` attribute can really be any processed data form that you like: sometimes it's a pandas dataframe (like with `wine_reviews_130k`), a list of tuples containing other data, (`reddit_comment_tree_graphs`), or other formats including `scipy.sparse` matrices or `igraph` graphs. The `target` (if you're using it), expects something that matches the `data` in terms of length. 
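For example, here is a quick way to poke at a freshly loaded `Dataset` (a sketch only; `my-dataset` is a placeholder for a name from your catalog):

```python
from {{ cookiecutter.module_name }}.data import Dataset

ds = Dataset.load("my-dataset")           # placeholder; use a name from the Catalog
print(type(ds.data))                      # e.g. a pandas DataFrame, list, or sparse matrix
target = getattr(ds, "target", None)
if target is not None:
    assert len(target) == len(ds.data)    # target lines up with data
print(sorted(ds.metadata.keys()))         # 'hashes', 'license', 'readme', ...
```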
41 | 42 | For a hint as to which data format to expect, you can look at the contents of the `README` attribute, one of the many pieces of metadata that are maintained as part of the `Dataset` object. 43 | 44 | This `metadata` is where things get interesting... which we'll cover on its own next. 45 | 46 | ## Why `metadata`? 47 | The `metadata` is where the magic lives. It serves several purposes in terms of bookkeeping: 48 | 49 | * it includes `HASHES`, which **improve data reproducibility**, since what you download and process gets checked each step along the way to ensure the raw data matches what is stored in the `dataset_catalog`, 50 | * it provides easy access to **what the data is** via the `README` attribute, 51 | * it provides easy (and continual) **access to the license / usage restrictions** for the data (the `LICENSE` attribute), which helps with knowing what you can do when [Sharing your Work](sharing-your-work.md). 52 | * it provides the **fileset data manifest**, `FILESET`, if your dataset includes additional raw data (fileset) files. 53 | 54 | In short, it helps you to know what data you're working with, what you can do with it, and whether something has gone wrong. 55 | 56 | Under the hood, metadata is a dictionary; however, metadata can also be accessed by referring to attributes expressed in uppercase. For example, `ds.metadata['license']` and `ds.LICENSE` refer to the same thing. 57 | 58 | ## Using a `Dataset` 59 | As mentioned before, to load a `Dataset`: 60 | ```python 61 | from {{ cookiecutter.module_name }}.data import Dataset 62 | ds = Dataset.load("") 63 | ``` 64 | At this point, if you already have a cached copy of the desired `Dataset` on disk, it will load it. Otherwise, it will follow the *recipe* for generating the requested `Dataset`; i.e. generate the dataset from raw data, as per the instructions contained in the `dataset_catalog` (described below). 65 | 66 | Because of licenses and other distribution restrictions, some of the datasets may require a **manual download** step. If so, you will be prompted at this point and given instructions for what to do. Some datasets will require local pre-processing. If so, the first time you run the command, you will be executing all of the processing scripts (which can be quite slow). 67 | 68 | After the first load, however, datasets will load from cache on disk, which should be fast. If you need to free up space, you can even delete related source files from `data/raw` and `data/interim`. Just don't touch the `data/processed` directory. 69 | 70 | To access the data, target, or metadata: 71 | ```python 72 | ds.data 73 | ds.target 74 | ds.metadata 75 | ``` 76 | 77 | To access the most common metadata fields: 78 | ```python 79 | ds.README # or ds.metadata['readme'] 80 | ds.LICENSE # or ds.metadata['license'] 81 | ds.HASHES # or ds.metadata['hashes'] 82 | ``` 83 | ## The catalog 84 | You can explore all of the currently available `Datasets` via the Dataset `Catalog`. The catalog keeps a record of the recipes used to generate a `Dataset`, along with relevant hashes that are used to ensure the integrity of data when it's loaded. 85 | 86 | To access the catalog: 87 | 88 | ```python 89 | from {{ cookiecutter.module_name }}.data import Catalog 90 | Catalog.load("datasets") 91 | ``` 92 | 93 | ## Sharing your Data as a `Dataset` object 94 | In order to convert your data to a `Dataset` object, you will need to generate a catalog *recipe* that uses a custom *function for processing your raw data*.
Doing so allows us to document all the munging, pre-processing, and data verification necessary to reproducibly build the dataset. 95 | 96 | ## What do you mean, LICENSE? 97 | No conversation on sharing data would be complete without a short discussion about data licenses. This will be covered in [Sharing your Work](sharing-your-work.md). 98 | 99 | 100 | ### Quick References 101 | 102 | * [README](../README.md) 103 | * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) 104 | * [Getting and Using Datasets](datasets.md) 105 | * [Using Notebooks for Analysis](notebooks.md) 106 | * [Sharing your Work](sharing-your-work.md) 107 | * [Troubleshooting Guide](troubleshooting.md) 108 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 
63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/{{ cookiecutter.repo_name }}.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/{{ cookiecutter.repo_name }}.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/{{ cookiecutter.repo_name }}" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/{{ cookiecutter.repo_name }}" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 
149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md: -------------------------------------------------------------------------------- 1 | # Using Notebooks for Analysis 2 | 3 | Jupyter Notebooks are a fantastic way for doing your EDA and sharing stories about your analysis afterwards. Unfortunately, (and yes, after many years of trying to use notebooks reproducibly, we are opinionated on this) they're a pretty terrible way to share code itself. While we still *love* using notebooks for sharing what we've done with others, especially in a workshop setting. 4 | 5 | We've set up this repo in a way to make it easier to use notebooks to share stories, while keeping your code in a python module where it belongs. 6 | 7 | Here's our best practices for using notebooks, while keeping your analyses sharable and reproducible. We've also included some of our favourite tricks and tips below for making using notebooks easier. (If you have more, please share them!) 8 | 9 | ## Naming Convention 10 | Notebooks go in the `notebooks` directory, and are named `dd-xyz-title.ipynb` where: 11 | 12 | * `dd` is an integer indicating the notebook sequence. This is critical when there are dependencies between notebooks 13 | * `xyz` is the author's initials, to help avoid namespace clashes when multiple parties are committing to the same repo 14 | * `title` is the name of the notebook, words separated by hyphens. 15 | 16 | e.g.`00-xyz-sample-notebook.ipynb` 17 | 18 | ## Source Control for Notebooks 19 | Here's where the code part of notebooks starts to get tricky. Notebooks awful for using with `git` and other source control systems because of the way that they are stored (giant JSON blob). If you're going to share your notebook back to the main surge repo (which we strongly encourage!): 20 | 21 | 1. Make sure your cells run sequentially (make sure you can **Kernel->Restart & Run All** successfully) 22 | 1. Clear all cell output before checking in your notebook (**Kernel->Restart & Clear Output** before saving). 23 | 24 | We realize that clearing the notebook (which gives cleaner diffs and PRs) is a bit of a trade-off against repoducibility of the notebook in that you lose the ability to check cell-by-cell whether you're getting the same results. One way to get around this in your own fork, is to use the `git nbdiff` feature, which is part of the `nbdiff` package (that is installed in this repo by default). You can find it on the right-hand side of the notebook toolbar, asc shown below: 25 | 26 | ![screenshot](images/toolbar-screenshot.png) 27 | 28 | This button will diff the notebook you have open intelligently against the the base version. We like to use `git nbdiff` as a visual diffing tool even if we are clearing output before checking in notebooks. 29 | 30 | If you want to give your future users help to see whether they are getting images and figures that match previous analyses, we recommend saving the figures in `reports/figures` and then putting them into a markdown cell in the notebook (so a user can see if what they generated is comparable). 31 | 32 | You can also optionally check your notebook in after a successful **Kernel->Restart & Run All**. 
This is a little more work to maintain diffs on, but can be nicer for communication withouit having to run a notebook to see what the results look like. 33 | 34 | ## On code 35 | As mentioned, notebooks aren't a great place for keeping code, as diffs and PRs in a notebook are virtually unreadable. This repo uses an editable python module called `{{ cookiecutter.module_name }}`. If you write code that you'd like to use in a notebook (e.g. `my_python_file.py`), put it in the `{{ cookiecutter.module_name }}/xyz` directory where `xyz` is the author's initials. You should then be able to immediately load it in your notebook via: 36 | ```python 37 | from {{ cookiecutter.module_name }}.xyz.my_python_file import my_function_name 38 | ``` 39 | If it's not immediately loading (or you need to restart your kernel to make it visible), make sure you run the following cell (preferably at the top of your notebook...see more on useful header cells below): 40 | ```python 41 | %load_ext autoreload 42 | %autoreload 2 43 | ``` 44 | 45 | ## Jupyter Tips and Tricks 46 | First up, if you're in a notebook, keyboard shortcuts can be found using the `Esc` key. Use them. 47 | 48 | ### Useful Header Cells 49 | #### Better display 50 | This cell makes your jupyter notebook use the full screen width. Put this as your first executable cell. You'll thank us. 51 | ```python 52 | from IPython.core.display import display, HTML 53 | display(HTML("")) 54 | ``` 55 | #### Autoreloading 56 | The cell 57 | ```python 58 | %load_ext autoreload 59 | %autoreload 2 60 | ``` 61 | let's you autoreload code that's changed in your environment. This means you can update your environment without killing your kernel or develop code in the `{{ cookiecutter.module_name }}` module that is immediately available via auto-reload. 62 | #### Python Libraries 63 | It helps to put your dependencies at the top of your notebook. Ours usually look something like this: 64 | ```python 65 | # Python Imports, alphabetized 66 | import pathlib 67 | ... 68 | 69 | #3rd party python modules, alphabetized 70 | import pandas as pd 71 | ... 72 | 73 | #Some plotting libraries 74 | import matplotlib.pyplot as plt 75 | %matplotlib notebook 76 | from bokeh.plotting import show, save, output_notebook, output_file 77 | from bokeh.resources import INLINE 78 | output_notebook(resources=INLINE) 79 | 80 | # Source module imports 81 | from {{ cookiecutter.module_name }} import paths 82 | from {{ cookiecutter.module_name }}.data import DataSource, Dataset, Catalog 83 | ``` 84 | You can also find most of these header cells in [00-xyz-sample-notebook.ipynb](../notebooks/00-xyz-sample-notebook.ipynb) 85 | 86 | ### Cell Magics 87 | There is a whole world of cell magics. These are bits of code that you can put at the top of a cell that do magical things. 
A few of our most used ones are: 88 | 89 | * `%%time`: time the cell (use this on slow cells) 90 | * `%debug`: invoke the python debugger (make sure to `exit` when you're done) 91 | * `%%file`: write current cell's content to a file (use `-a` to append) 92 | * `%load`: load a file's contents into the current cell 93 | * `%%bash`: run the cell using bash kernel 94 | 95 | 96 | ### Quick References 97 | 98 | * [README](../README.md) 99 | * [Setting up and Maintaining your Conda Environment, Reproducibly](conda-environments.md) 100 | * [Getting and Using Datasets](datasets.md) 101 | * [Specifying Paths in Easydata](paths.md) 102 | * [Using Notebooks for Analysis](notebooks.md) 103 | * [Sharing your Work](sharing-your-work.md) 104 | * [Troubleshooting Guide](troubleshooting.md) 105 | -------------------------------------------------------------------------------- /docs/New-Edge-Template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# General template for creating a derived dataset *(aka. adding an edge to the DatasetGraph)*\n", 8 | "\n", 9 | "This example creates the dataset from in the [`Add-csv-template.ipynb`](https://cookiecutter-easydata.readthedocs.io/en/latest/Add-csv-template/) example, but does it completely generally without using the functions in `helpers` and builds on the `New-Dataset-Template.ipynb` example. Any derived dataset can be added in this way as an *edge* in the `DatasetGraph`." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Basic imports" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "%load_ext autoreload\n", 26 | "%autoreload 2" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# Basic utility functions\n", 36 | "import logging\n", 37 | "import os\n", 38 | "import pathlib\n", 39 | "from src.log import logger\n", 40 | "from src import paths\n", 41 | "from src.utils import list_dir\n", 42 | "from functools import partial\n", 43 | "\n", 44 | "# data functions\n", 45 | "from src.data import DataSource, Dataset, DatasetGraph\n", 46 | "from src import helpers" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Optionally set to debug log level\n", 56 | "logger.setLevel(logging.DEBUG)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Source dataset\n", 64 | "\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "source_ds_name = 'covid-19-epidemiology-raw'" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "source_ds = Dataset.load(source_ds_name)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "source_ds.FILESET" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## Create and add your transfomer function\n", 99 | "Here we'll use a pre-built transformer function `csv_to_pandas`, but normally you would place your new transformer function in 
`{your_project_module}/data/transformer_functions.py` as in the [`Add-Derived-Dataset.ipynb`](https://cookiecutter-easydata.readthedocs.io/en/latest/Add-derived-dataset/) example. \n", 100 | "\n", 101 | "Transformer functions take a dict of Datasets of the form `{ds_name: ds}` as input and outputs a new dict of Datasets of the same form.\n", 102 | "\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "from src.data.transformer_functions import csv_to_pandas\n", 112 | "from src.data import serialize_transformer_pipeline" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "## Fill this in for your dataset\n", 122 | "ds_name = 'covid-19-epidemiology'\n", 123 | "transformers = [partial(csv_to_pandas,\n", 124 | " output_map={ds_name:'epidemiology.csv'})]" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Create the new edge in the transformer graph" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "dag = DatasetGraph(catalog_path=paths['catalog_path'])" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "dag.add_edge(input_dataset=source_ds_name,\n", 150 | " output_dataset=ds_name,\n", 151 | " transformer_pipeline=serialize_transformer_pipeline(transformers),\n", 152 | " overwrite_catalog=True)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "%%time\n", 162 | "ds = Dataset.from_catalog(ds_name)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "%%time\n", 172 | "ds = Dataset.load(ds_name)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "print(ds.README)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "print(ds.LICENSE)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "ds.data.shape" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "ds.data.head()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## Check-in the new dataset\n", 216 | "Finally, check in the new catalog files. 
" 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "kernelspec": { 222 | "display_name": "Python [conda env:easydata-notebook]", 223 | "language": "python", 224 | "name": "conda-env-easydata-notebook-py" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.7.10" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/hackalog/cookiecutter-easydata.svg?branch=master)](https://travis-ci.org/hackalog/cookiecutter-easydata) 2 | [![CircleCI](https://circleci.com/gh/hackalog/easydata.svg?style=shield)](https://app.circleci.com/pipelines/github/hackalog/easydata) 3 | [![Coverage Status](https://coveralls.io/repos/github/hackalog/cookiecutter-easydata/badge.svg?branch=master)](https://coveralls.io/github/hackalog/cookiecutter-easydata?branch=master) 4 | [![Documentation Status](https://readthedocs.org/projects/cookiecutter-easydata/badge/?version=latest)](https://cookiecutter-easydata.readthedocs.io/en/latest/?badge=latest) 5 | 6 | # EasyData 7 | 8 | _A python framework and git template for data scientists, teams, and workshop organizers 9 | aimed at making your data science **reproducible**_ 10 | 11 | For most of us, data science is 5% science, 60% data cleaning, and 35% 12 | IT hell. Easydata focuses the 95% by helping you deliver 13 | * reproducible python environments, 14 | * reproducible datasets, and 15 | * reproducible workflows 16 | 17 | In other words, Easydata is a template, library, and workflow that lets you **get up and running with your data science analysis, quickly and reproducibly**. 18 | 19 | ## What is Easydata? 20 | 21 | Easydata is a framework for building custom data science git repos that provides: 22 | * An **prescribed workflow** for collaboration, storytelling, 23 | * A **python framework** to support this workflow 24 | * A **makefile wrapper** for conda and pip environment management 25 | * prebuilt **dataset recipes**, and 26 | * a vast library of training materials and documentation around doing reproducible data science. 27 | 28 | Easydata is **not** 29 | * an ETL tooklit 30 | * A data analysis pipeline 31 | * a containerization solution, or 32 | * a prescribed data format. 33 | 34 | 35 | ### Requirements to use this framework: 36 | - anaconda (or miniconda) 37 | - python3.6+ (we use f-strings. So should you) 38 | - [Cookiecutter Python package](http://cookiecutter.readthedocs.org/en/latest/installation.html) >= 1.4.0: This can be installed with pip by or conda depending on how you manage your Python packages: 39 | 40 | once you've installed anaconda, you can install the remaining requirements (including cookiecutter) by doing: 41 | 42 | ```bash 43 | conda create --name easydata python=3 44 | conda activate easydata 45 | python -m pip install -f requirements.txt 46 | ``` 47 | 48 | 49 | ### To start a new project, run: 50 | ------------ 51 | 52 | cookiecutter https://github.com/hackalog/easydata 53 | 54 | ### To find out more 55 | ------------ 56 | A good place to start is with reproducible environments. 
We have a tutorial here: [Getting Started with EasyData Environments](https://github.com/hackalog/easydata/wiki/Getting-Started-with-EasyData-Environments). 57 | 58 | The next place to look is the customized documentation included in any EasyData-created repo, generated from the settings you chose in your template. These reference documents live under `reference/easydata` in your repo and cover: 59 | * more on conda environments 60 | * more on paths 61 | * git configuration (including setting up ssh with GitHub) 62 | * git workflows 63 | * tricks for using Jupyter notebooks in an EasyData environment 64 | * troubleshooting 65 | * recommendations for how to share your work 66 | 67 | Furthermore, see: 68 | * [The EasyData documentation on read the docs](https://cookiecutter-easydata.readthedocs.io/en/latest/?badge=latest): this contains up-to-date working examples of how to use EasyData for reproducible datasets and some ways to use notebooks reproducibly 69 | * [Talks and Tutorials based on EasyData](https://github.com/hackalog/easydata/wiki/EasyData-Talks-and-Tutorials) 70 | * [Catalog of EasyData Documentation](https://github.com/hackalog/easydata/wiki/Catalog-of-EasyData-Documentation) 71 | * [The EasyData wiki](https://github.com/hackalog/easydata/wiki): check here for further troubleshooting and how-to guides for particular problems that aren't in the `reference/easydata` docs (including a `git` tutorial) 72 | 73 | ### The resulting directory structure 74 | ------------ 75 | 76 | The directory structure of your new project looks like this: 77 | 78 | 79 | * `LICENSE` 80 | * Terms of use for this repo 81 | * `Makefile` 82 | * top-level makefile. Type `make` for a list of valid commands 83 | * `Makefile.include` 84 | * Global includes for makefile routines. Included by `Makefile`. 85 | * `Makefile.envs` 86 | * Commands for maintaining a reproducible conda environment. Included by `Makefile`. 87 | * `README.md` 88 | * this file 89 | * `catalog` 90 | * Data catalog. This is where config information such as data sources 91 | and data transformations are saved 92 | * `catalog/config.ini` 93 | * Local Data Store. This configuration file is for local data only, and is never checked into the repo. 94 | * `data` 95 | * Data directory, often symlinked to a filesystem with lots of space 96 | * `data/raw` 97 | * Raw (immutable) hash-verified downloads 98 | * `data/interim` 99 | * Extracted and interim data representations 100 | * `data/interim/cache` 101 | * Dataset cache 102 | * `data/processed` 103 | * The final, canonical data sets for modeling. 104 | * `docs` 105 | * Sphinx-format documentation files for this project. 106 | * `docs/Makefile`: Makefile for generating HTML/Latex/other formats from Sphinx-format documentation. 107 | * `notebooks` 108 | * Jupyter notebooks. Naming convention is a number (for ordering), 109 | the creator's initials, and a short `-` delimited description, 110 | e.g. `1.0-jqp-initial-data-exploration`. 111 | * `reference` 112 | * Data dictionaries, documentation, manuals, scripts, papers, or other explanatory materials. 113 | * `reference/easydata`: Easydata framework and workflow documentation. 114 | * `reference/templates`: Templates and code snippets for Jupyter 115 | * `reference/dataset`: resources related to datasets; e.g. dataset creation notebooks and scripts 116 | * `reports` 117 | * Generated analysis as HTML, PDF, LaTeX, etc.
118 | * `reports/figures` 119 | * Generated graphics and figures to be used in reporting 120 | * `environment.yml` 121 | * The user-readable YAML file for reproducing the conda/pip environment. 122 | * `environment.(platform).lock.1yml` 123 | * resolved versions, result of processing `environment.yml` 124 | * `setup.py` 125 | * Turns contents of `MODULE_NAME` into a 126 | pip-installable python module (`pip install -e .`) so it can be 127 | imported in python code 128 | * `MODULE_NAME` 129 | * Source code for use in this project. 130 | * `MODULE_NAME/__init__.py` 131 | * Makes MODULE_NAME a Python module 132 | * `MODULE_NAME/data` 133 | * code to fetch raw data and generate Datasets from them 134 | * `MODULE_NAME/analysis` 135 | * code to turn datasets into output products 136 | 137 | ### Installing development requirements 138 | The first time: 139 | ``` 140 | make create_environment 141 | git init 142 | git add . 143 | git commit -m "initial import" 144 | git branch easydata # tag for future easydata upgrades 145 | ``` 146 | 147 | Subsequent updates: 148 | ``` 149 | make update_environment 150 | ``` 151 | 152 | In case you need to delete the environment later: 153 | ``` 154 | conda deactivate 155 | make delete_environment 156 | ``` 157 | 158 | 159 | ## Credits and Thanks 160 | * Early versions of Easydata were based on the excellent 161 | [cookiecutter-data-science](http://drivendata.github.io/cookiecutter-data-science/) 162 | template. 163 | * Thanks to the [Tutte Institute](https://github.com/TutteInstitute) for supporting the development of this framework. 164 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import pathlib 4 | import random 5 | import sys 6 | import pandas as pd 7 | import numpy as np 8 | from typing import Iterator, List 9 | from functools import partial 10 | from joblib import func_inspect as jfi 11 | 12 | from ..log import logger 13 | from .. import paths 14 | 15 | __all__ = [ 16 | 'deserialize_partial', 17 | 'normalize_labels', 18 | 'partial_call_signature', 19 | 'read_space_delimited', 20 | 'reservoir_sample', 21 | 'serialize_partial', 22 | ] 23 | 24 | _MODULE = sys.modules[__name__] 25 | _MODULE_DIR = pathlib.Path(os.path.dirname(os.path.abspath(__file__))) 26 | 27 | def read_space_delimited(filename, skiprows=None, class_labels=True, metadata=None): 28 | """Read a space-delimited file 29 | 30 | Data is space-delimited. Last column is the (string) label for the data 31 | 32 | Note: we can't use automatic comment detection, as `#` characters are also 33 | used as data labels. 34 | 35 | Parameters 36 | ---------- 37 | skiprows: list-like, int or callable, optional 38 | list of rows to skip when reading the file. See `pandas.read_csv` 39 | entry on `skiprows` for more 40 | class_labels: boolean 41 | if true, the last column is treated as the class (target) label 42 | """ 43 | with open(filename, 'r') as fd: 44 | df = pd.read_csv(fd, skiprows=skiprows, skip_blank_lines=True, 45 | comment=None, header=None, sep=' ', dtype=str) 46 | # targets are last column. 
Data is everything else 47 | if class_labels is True: 48 | target = df.loc[:, df.columns[-1]].values 49 | data = df.loc[:, df.columns[:-1]].values 50 | else: 51 | data = df.values 52 | target = np.zeros(data.shape[0]) 53 | return data, target, metadata 54 | 55 | def normalize_labels(target): 56 | """Map an arbitary target vector to an integer vector 57 | 58 | Returns 59 | ------- 60 | tuple: (mapped_target, label_map) 61 | 62 | where: 63 | mapped_target: integer vector of same shape as target 64 | label_map: dict mapping mapped_target integers to original labels 65 | 66 | Examples 67 | -------- 68 | >>> target = np.array(['a','b','c','a']) 69 | >>> mapped_target, label_map = normalize_labels(target) 70 | >>> mapped_target 71 | array([0, 1, 2, 0]) 72 | 73 | The following should always be true 74 | 75 | >>> all(np.vectorize(label_map.get)(mapped_target) == target) 76 | True 77 | """ 78 | label_map = {k:v for k, v in enumerate(np.unique(target))} 79 | label_map_inv = {v:k for k, v in label_map.items()} 80 | mapped_target = np.vectorize(label_map_inv.get)(target) 81 | 82 | return mapped_target, label_map 83 | 84 | def partial_call_signature(func): 85 | """Return the fully qualified call signature for a (partial) function 86 | """ 87 | func = partial(func) 88 | fa = jfi.getfullargspec(func) 89 | default_kw = {} 90 | if fa.args: 91 | default_kw = dict(zip(fa.args, fa.defaults)) 92 | if getattr(fa, 'kwonlydefaults', None): 93 | fq_keywords = {**default_kw, **fa.kwonlydefaults} 94 | else: 95 | fq_keywords = default_kw 96 | return jfi.format_signature(func.func, *func.args, **fq_keywords) 97 | 98 | def process_dataset_default(metadata=None, **kwargs): 99 | """Placeholder for data processing function""" 100 | dataset_name = kwargs.get('dataset_name', 'unknown-dataset') 101 | logger.error(f"'{dataset_name}()' function not found. 
Define it add it to the `user` namespace for correct behavior") 102 | return None, None, metadata 103 | 104 | def deserialize_partial(func_dict, delete_keys=False, 105 | key_base='load_function', 106 | fail_func=None): 107 | """Convert a serialized function call into a partial 108 | 109 | if there is an error, returns a default function (process_dataset_default) 110 | 111 | Parameters 112 | ---------- 113 | func_dict: dict containing 114 | {key_base}_name: function name 115 | {key_base}_module: module containing function 116 | {key_base}_args: args to pass to function 117 | {key_base}_kwargs: kwargs to pass to function 118 | 119 | delete_keys: Boolean 120 | if True, keys are deleted from `func_dict` if found 121 | key_base: str 122 | name to be used when generating looking up keys in `func_dict` 123 | fail_func: 124 | function to use if no valid function found in the namespace 125 | 126 | """ 127 | 128 | if delete_keys: 129 | args = func_dict.pop(f"{key_base}_args", []) 130 | kwargs = func_dict.pop(f"{key_base}_kwargs", {}) 131 | base_name = func_dict.pop(f"{key_base}_name", 'process_dataset_default') 132 | func_mod_name = func_dict.pop(f'{key_base}_module', None) 133 | else: 134 | args = func_dict.get(f"{key_base}_args", []) 135 | kwargs = func_dict.get(f"{key_base}_kwargs", {}) 136 | base_name = func_dict.get(f"{key_base}_name", 'process_dataset_default') 137 | func_mod_name = func_dict.get(f'{key_base}_module', None) 138 | 139 | if fail_func is None: 140 | fail_func = partial(process_dataset_default, dataset_name=base_name) 141 | 142 | try: 143 | if func_mod_name: 144 | func_mod = importlib.import_module(func_mod_name) 145 | else: 146 | func_mod = _MODULE 147 | func_name = getattr(func_mod, base_name, fail_func) 148 | except ModuleNotFoundError as e: 149 | logger.error(f"Invalid parse_function: {e}") 150 | func_name = fail_func 151 | func = partial(func_name, *args, **kwargs) 152 | 153 | return func 154 | 155 | def serialize_partial(func, key_base='load_function'): 156 | """Serialize a function call to a dictionary. 157 | 158 | Parameters 159 | ---------- 160 | func: function 161 | function to serialize 162 | key_base: str. Default 'load_function' 163 | string to prepend to serialization parameters. 164 | 165 | Returns 166 | ------- 167 | dict containing: 168 | {key_base}_name: function name 169 | {key_base}_module: fully-qualified module name containing function 170 | {key_base}_args: args to pass to function 171 | {key_base}_kwargs: kwargs to pass to function 172 | """ 173 | 174 | entry = {} 175 | if func is None: 176 | logger.warning(f"serialize_partial: `{key_base}` is None. 
Ignoring.") 177 | return entry 178 | func = partial(func) 179 | entry[f'{key_base}_module'] = ".".join(jfi.get_func_name(func.func)[0]) 180 | entry[f'{key_base}_name'] = jfi.get_func_name(func.func)[1] 181 | entry[f'{key_base}_args'] = func.args 182 | entry[f'{key_base}_kwargs'] = func.keywords 183 | return entry 184 | 185 | def reservoir_sample(filename, n_samples=1, random_seed=None): 186 | """Return a random subset of lines from a file 187 | 188 | Parameters 189 | ---------- 190 | filename: path 191 | File to be loaded 192 | n_samples: int 193 | number of lines to return 194 | random_seed: int or None 195 | If set, use this as the random seed 196 | """ 197 | if random_seed is not None: 198 | random.seed(random_seed) 199 | sample = [] 200 | with open(filename) as f: 201 | for n, line in enumerate(f): 202 | if n < n_samples: 203 | sample.append(line.rstrip()) 204 | else: 205 | r = random.randint(0, n_samples) 206 | if r < n_samples: 207 | sample[r] = line.rstrip() 208 | return sample 209 | 210 | 211 | def iter_directory(root: pathlib.Path) -> Iterator[pathlib.Path]: 212 | """ 213 | Iterates the contents of a directory recursively, in depth-first 214 | alphanumeric order. 215 | 216 | Parameters 217 | ---------- 218 | path 219 | Path to the directory to iterate. 220 | 221 | Items 222 | ----- 223 | Paths to the various items contained in the directory and its subdirectories, recursively. The root prepends all the 224 | yielded paths. 225 | """ 226 | def listdir_sorted(path: pathlib.Path) -> List[pathlib.Path]: 227 | return sorted(list(path.iterdir()), reverse=True) 228 | 229 | elements = listdir_sorted(root) 230 | while elements: 231 | item = elements.pop() 232 | yield item 233 | if item.is_dir(): 234 | elements += listdir_sorted(item) 235 | -------------------------------------------------------------------------------- /docs/opinions.md: -------------------------------------------------------------------------------- 1 | # Easydata Opinions 2 | 3 | There are some opinions implicit in the project structure that have grown out of our experience with what works and what doesn't when collaborating on data science projects. Some of the opinions are about workflows, and some of the opinions are about tools that make life easier. Here are some of the beliefs which this project is built on—if you've got thoughts, please [contribute or share them](#contributing). 4 | 5 | ### Data is immutable 6 | 7 | Don't ever edit your raw data, especially not manually. Don't overwrite your raw data. Don't save multiple versions of the raw data. Treat the data (and its format) as immutable. The code you write should move the raw data through a pipeline to your final analysis. You shouldn't have to run all of the steps every time you want to make a new figure (see [Analysis is a DAG](#analysis-is-a-dag)), but anyone should be able to reproduce the final products with only the code in {{ cookiecutter.module_name }} and the data in `data/raw`. 8 | 9 | Also, if data is immutable, it doesn't need source control in the same way that code does. Therefore, ***by default, the data folder is included in the `.gitignore` file.*** If you have a small amount of data that rarely changes, you may want to include the data in the repository. Github currently warns if files are over 50MB and rejects files over 100MB. 
Some other options for storing/syncing large data include [AWS S3](https://aws.amazon.com/s3/) with a syncing tool (e.g., [`s3cmd`](http://s3tools.org/s3cmd)), [Git Large File Storage](https://git-lfs.github.com/), [Git Annex](https://git-annex.branchable.com/), and [dat](http://dat-data.com/). 10 | 11 | ### Shared workflows matter 12 | Shared workflows matter in enabling reproducible results and smoother collaboration. That's why we include a suite of recommended (but lightweight) workflows that help you collaborate with others in our `framework-docs`. Use them out-of-the-box for a workshop, or adapt them to suit your team's needs. Either way, we recommend that shared workflows stay with the project and include a few key elements: 13 | 14 | * Contributor guidelines 15 | * A shared git workflow 16 | * How to submit issues, questions, or get help 17 | * Where to put different types of project materials such as code, notebooks for story-telling, documentation, visualizations, and other deliverables 18 | * Which licenses to use (a.k.a. terms for sharing) 19 | 20 | ### Notebooks are for exploration and communication 21 | 22 | Notebook packages like the [Jupyter notebook](http://jupyter.org/), [Beaker notebook](http://beakernotebook.com/), [Zeppelin](http://zeppelin-project.org/), and other literate programming tools are very effective for exploratory data analysis. However, these tools can be less effective for reproducing an analysis. When we use notebooks in our work, we often subdivide the `notebooks` folder. For example, `notebooks/exploratory` contains initial explorations, whereas `notebooks/reports` is more polished work that can be exported as HTML to the `reports` directory. 23 | 24 | Since notebooks are challenging objects for source control (e.g., diffs of the `json` are often not human-readable and merging is near impossible), we recommend not collaborating directly with others on Jupyter notebooks. There are two steps we recommend for using notebooks effectively: 25 | 26 | 1. Follow a naming convention that shows the owner and the order the analysis was done in. We use the format `<step>-<ghuser>-<description>.ipynb` (e.g., `0.3-bull-visualize-distributions.ipynb`). 27 | 28 | 2. Refactor the good parts. Don't write code to do the same task in multiple notebooks. If it's a data preprocessing task, put it in the pipeline at `src/data/make_dataset.py` and load data from `data/interim`. If it's useful utility code, refactor it to `src`. 29 | 30 | Now by default we turn the project into a Python package (see the `setup.py` file). You can import your code and use it in notebooks with a cell like the following: 31 | 32 | ``` 33 | # OPTIONAL: Load the "autoreload" extension so that code can change 34 | %load_ext autoreload 35 | 36 | # OPTIONAL: always reload modules so that as you change code in src, it gets loaded 37 | %autoreload 2 38 | 39 | from src.data import make_dataset 40 | ``` 41 | 42 | ### Analysis is a DAG 43 | 44 | Often in an analysis you have long-running steps that preprocess data or train models. If these steps have been run already (and you have stored the output somewhere like the `data/interim` directory), you don't want to wait to rerun them every time. We prefer [`make`](https://www.gnu.org/software/make/) for managing steps that depend on each other, especially the long-running ones. Make is a common tool on Unix-based platforms (and is available for Windows).
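To make the DAG idea concrete, here is a minimal sketch of the kind of `Makefile` this implies. The script and file names below (`src/data/make_dataset.py`, `src/features/build_features.py`, `src/models/train_model.py`) are placeholders for illustration only, not files that ship with this template:

```
# Hypothetical three-stage pipeline: raw -> interim -> processed -> trained model.
# Each target is rebuilt only when one of its prerequisites is newer than it.

data/interim/cleaned.csv: data/raw/source.csv src/data/make_dataset.py
	python src/data/make_dataset.py data/raw/source.csv $@

data/processed/features.csv: data/interim/cleaned.csv src/features/build_features.py
	python src/features/build_features.py $< $@

models/trained/model.pkl: data/processed/features.csv src/models/train_model.py
	python src/models/train_model.py $< $@

.PHONY: all
all: models/trained/model.pkl
```

Running `make all` reruns only the stages whose inputs have changed; everything else is reused from the files already on disk.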
Following the [`make` documentation](https://www.gnu.org/software/make/), [Makefile conventions](https://www.gnu.org/prep/standards/html_node/Makefile-Conventions.html#Makefile-Conventions), and [portability guide](http://www.gnu.org/savannah-checkouts/gnu/autoconf/manual/autoconf-2.69/html_node/Portable-Make.html#Portable-Make) will help ensure your Makefiles work effectively across systems. Here are [some](http://zmjones.com/make/) [examples](http://blog.kaggle.com/2012/10/15/make-for-data-scientists/) to [get started](https://web.archive.org/web/20150206054212/http://www.bioinformaticszen.com/post/decomplected-workflows-makefiles/). A number of data folks use `make` as their tool of choice, including [Mike Bostock](https://bost.ocks.org/mike/make/). 45 | 46 | There are other tools for managing DAGs that are written in Python instead of a DSL (e.g., [Paver](http://paver.github.io/paver/#), [Luigi](http://luigi.readthedocs.org/en/stable/index.html), [Airflow](http://pythonhosted.org/airflow/cli.html), [Snakemake](https://bitbucket.org/snakemake/snakemake/wiki/Home), [Ruffus](http://www.ruffus.org.uk/), or [Joblib](https://pythonhosted.org/joblib/memory.html)). Feel free to use these if they are more appropriate for your analysis. 47 | 48 | ### Build from the environment up 49 | 50 | The first step in reproducing an analysis is always reproducing the computational environment it was run in. You need the same tools, the same libraries, and the same versions to make everything play nicely together. 51 | 52 | One effective approach to this is to use [conda](https://anaconda.org). By listing all of your requirements in the repository (we include an `environment.yml` file), you can easily track the packages needed to recreate the analysis. Here is a good workflow: 53 | 54 | 1. Run `make create_environment` when creating a new project 55 | 2. Add new requirements to `environment.yml`, either in the main section (for conda installations), or under the indented `- pip:` line, if it should be pip installed. 56 | 3. Type `make update_environment` 57 | 58 | If you have more complex requirements for recreating your environment, consider a virtual machine based approach such as [Docker](https://www.docker.com/) or [Vagrant](https://www.vagrantup.com/). Both of these tools use text-based formats (Dockerfile and Vagrantfile, respectively) that you can easily add to source control to describe how to create a virtual machine with the requirements you need. 59 | 60 | ### Keep secrets and configuration out of version control 61 | 62 | You _really_ don't want to leak your AWS secret key or Postgres username and password on GitHub. Enough said — see the [Twelve Factor App](http://12factor.net/config) principles on this point. Here's one way to do this: 63 | 64 | #### Store your secrets and config variables in a special file 65 | 66 | Create a `.env` file in the project root folder. Thanks to the `.gitignore`, this file should never get committed into the version control repository. Here's an example: 67 | 68 | ```nohighlight 69 | # example .env file 70 | DATABASE_URL=postgres://username:password@localhost:5432/dbname 71 | AWS_ACCESS_KEY=myaccesskey 72 | AWS_SECRET_ACCESS_KEY=mysecretkey 73 | OTHER_VARIABLE=something 74 | ``` 75 | 76 | #### Use a package to load these variables automatically.
77 | 78 | If you look at the stub script in `src/data/make_dataset.py`, it uses a package called [python-dotenv](https://github.com/theskumar/python-dotenv) to load up all the entries in this file as environment variables so they are accessible with `os.environ.get`. Here's an example snippet adapted from the `python-dotenv` documentation: 79 | 80 | ```python 81 | # src/data/dotenv_example.py 82 | import os 83 | from dotenv import load_dotenv, find_dotenv 84 | 85 | # find .env automagically by walking up directories until it's found 86 | dotenv_path = find_dotenv() 87 | 88 | # load up the entries as environment variables 89 | load_dotenv(dotenv_path) 90 | 91 | database_url = os.environ.get("DATABASE_URL") 92 | other_variable = os.environ.get("OTHER_VARIABLE") 93 | ``` 94 | 95 | ### Be conservative in changing the default folder structure 96 | 97 | To keep this structure broadly applicable for many different kinds of projects, we think the best approach is to be liberal in changing the folders around for _your_ project, but be conservative in changing the default structure for _all_ projects. -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # {{ cookiecutter.project_name }} documentation build configuration file, created by 4 | # sphinx-quickstart. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import os 15 | import sys 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 20 | # sys.path.insert(0, os.path.abspath('.')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | # needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = [] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = '.rst' 36 | 37 | # The encoding of source files. 38 | # source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = 'index' 42 | 43 | # General information about the project. 44 | project = u'{{ cookiecutter.project_name }}' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 
57 | # language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | # today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | # today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | # default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | # add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | # add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | # show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | # modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | # html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | # html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | # html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | # html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | # html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | # html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | # html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | # html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | # html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | # html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | # html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 
147 | # html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | # html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | # html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | # html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | # html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | # html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = '{{ cookiecutter.repo_name }}doc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | # 'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | # 'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | # 'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 187 | '{{ cookiecutter.repo_name }}.tex', 188 | u'{{ cookiecutter.project_name }} Documentation', 189 | u"{{ cookiecutter.author_name }}", 'manual'), 190 | ] 191 | 192 | # The name of an image file (relative to this directory) to place at the top of 193 | # the title page. 194 | # latex_logo = None 195 | 196 | # For "manual" documents, if this is true, then toplevel headings are parts, 197 | # not chapters. 198 | # latex_use_parts = False 199 | 200 | # If true, show page references after internal links. 201 | # latex_show_pagerefs = False 202 | 203 | # If true, show URL addresses after external links. 204 | # latex_show_urls = False 205 | 206 | # Documents to append as an appendix to all manuals. 207 | # latex_appendices = [] 208 | 209 | # If false, no module index is generated. 210 | # latex_domain_indices = True 211 | 212 | 213 | # -- Options for manual page output -------------------------------------------- 214 | 215 | # One entry per manual page. List of tuples 216 | # (source start file, name, description, authors, manual section). 217 | man_pages = [ 218 | ('index', '{{ cookiecutter.repo_name }}', u'{{ cookiecutter.project_name }} Documentation', 219 | [u"{{ cookiecutter.author_name }}"], 1) 220 | ] 221 | 222 | # If true, show URL addresses after external links. 223 | # man_show_urls = False 224 | 225 | 226 | # -- Options for Texinfo output ------------------------------------------------ 227 | 228 | # Grouping the document tree into Texinfo files. List of tuples 229 | # (source start file, target name, title, author, 230 | # dir menu entry, description, category) 231 | texinfo_documents = [ 232 | ('index', '{{ cookiecutter.repo_name }}', u'{{ cookiecutter.project_name }} Documentation', 233 | u"{{ cookiecutter.author_name }}", '{{ cookiecutter.project_name }}', 234 | '{{ cookiecutter.description }}', 'Miscellaneous'), 235 | ] 236 | 237 | # Documents to append as an appendix to all manuals. 238 | # texinfo_appendices = [] 239 | 240 | # If false, no module index is generated. 
241 | # texinfo_domain_indices = True 242 | 243 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 244 | # texinfo_show_urls = 'footnote' 245 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/README.md: -------------------------------------------------------------------------------- 1 | {{cookiecutter.project_name}} 2 | ============================== 3 | _Author: {{ cookiecutter.author_name }}_ 4 | 5 | {{cookiecutter.description}} 6 | 7 | ABOUT EASYDATA 8 | -------------- 9 | This git repository is built from the [Easydata](https://github.com/hackalog/easydata) framework, which aims to make 10 | your data science workflow reproducible. The Easydata framework includes: 11 | 12 | * tools for managing conda environments in a consistent and reproducible way, 13 | * built-in dataset management (including tracking of metadata such as LICENSES and READMEs), 14 | * a prescribed project directory structure, 15 | * workflows and conventions for contributing notebooks and other code. 16 | 17 | EASYDATA REQUIREMENTS 18 | ------------ 19 | * Make 20 | * conda >= 4.8 (via Anaconda or Miniconda) 21 | * Git 22 | 23 | GETTING STARTED 24 | --------------- 25 | ### Initial Git Configuration and Checking Out the Repo 26 | 27 | If you haven't yet done so, please follow the instructions 28 | in [Setting up git and Checking Out the Repo](reference/easydata/git-configuration.md) in 29 | order to check out the code and set up your remote branches. 30 | 31 | Note: These instructions assume you are using SSH keys (and not HTTPS authentication) with {{ cookiecutter.upstream_location }}. 32 | If you haven't set up SSH access to {{ cookiecutter.upstream_location }}, see [Configuring SSH Access to {{cookiecutter.upstream_location}}](https://github.com/hackalog/easydata/wiki/Configuring-SSH-Access-to-Github). This also includes instructions for using more than one account with SSH keys. 33 | 34 | Once you've got your local, `origin`, and `upstream` branches configured, you can follow the instructions in this handy [Git Workflow Cheat Sheet](reference/easydata/git-workflow.md) to keep your working copy of the repo in sync with the others. 35 | 36 | ### Setting up your environment 37 | **WARNING**: If you have conda-forge listed as a channel in your `.condarc` (or any other channels other than defaults), you may experience great difficulty generating reproducible conda environments. 38 | 39 | We recommend you remove conda-forge (and all other non-default channels) from your `.condarc` file and [set your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html). Alternate channels can be specified explicitly in your `environment.yml` by prefixing your package name with `channel-name::`; e.g.
40 | ``` 41 | - wheel # install from the default (anaconda) channel 42 | - pytorch::pytorch # install this from the `pytorch` channel 43 | - conda-forge::tokenizers # install this from conda-forge 44 | ``` 45 | 46 | ### Initial setup 47 | 48 | * Make note of the path to your conda binary: 49 | ``` 50 | $ which conda 51 | ~/miniconda3/bin/conda 52 | ``` 53 | * ensure your `CONDA_EXE` environment variable is set to this value (or edit `Makefile.include` directly) 54 | ``` 55 | export CONDA_EXE=~/miniconda3/bin/conda 56 | ``` 57 | * Create and switch to the virtual environment: 58 | ``` 59 | cd {{cookiecutter.repo_name}} 60 | make create_environment 61 | conda activate {{cookiecutter.repo_name}} 62 | ``` 63 | 64 | Now you're ready to run `jupyter notebook` (or jupyterlab) and explore the notebooks in the `notebooks` directory. 65 | 66 | For more instructions on setting up and maintaining your environment (including how to point your environment at your custom forks and work in progress), see [Setting up and Maintaining your Conda Environment Reproducibly](reference/easydata/conda-environments.md). 67 | 68 | ### Loading Datasets 69 | 70 | At this point you will be able to load any of the pre-built datasets with the following set of commands: 71 | ```python 72 | from {{ cookiecutter.module_name }}.data import Dataset 73 | ds = Dataset.load("<dataset-name>") 74 | ``` 75 | Because of licenses and other distribution restrictions, some of the datasets will require a manual download step. If so, you will be prompted at this point and given instructions for what to do. Some datasets will require local pre-processing. If so, the first time you run the command, you will be executing all of the processing scripts (which can be quite slow). 76 | 77 | After the first time, data will be loaded from the cache on disk, which should be fast. 78 | 79 | To see which datasets are currently available: 80 | ```python 81 | from {{ cookiecutter.module_name }} import workflow 82 | workflow.available_datasets(keys_only=True) 83 | ``` 84 | 85 | Note: sometimes datasets can be quite large. If you want to store your data externally, we recommend symlinking your data directory (that is `{{cookiecutter.repo_name}}/data`) to somewhere with more room. 86 | 87 | For more on Datasets, see [Getting and Using Datasets](reference/easydata/datasets.md). 88 | 89 | ### Using Notebooks and Sharing your Work 90 | This repo has been set up in such a way as to make: 91 | 92 | * environment management easy and reproducible 93 | * sharing analyses via notebooks easy and reproducible 94 | 95 | There are some tricks, hacks, and built-in utilities that you'll want to check out: [Using Notebooks for Analysis](reference/easydata/notebooks.md). 96 | 97 | Here are some best practices for sharing using this repo: 98 | 99 | * Notebooks go in the...you guessed it...`notebooks` directory. The naming convention is a number (for ordering), the creator's initials, and a short `-` delimited description, e.g. `01-jqp-initial-data-exploration`. Please increment the starting number when creating a new notebook. 100 | * When checking in a notebook, run **Kernel->Restart & Run All** or **Kernel->Restart & Clear Output** and then **Save** before checking it in. 101 | * Put any scripts or other code in the `{{ cookiecutter.module_name }}` module. We suggest you create a directory using the same initials you put in your notebook titles (e.g. `{{ cookiecutter.module_name }}/xyz`). You will be able to import it into your notebooks via `from {{ cookiecutter.module_name }}.xyz import ...`.
102 | * See the Project Organization section below to see where other materials should go, such as reports, figures, and references. 103 | 104 | For more on sharing your work, including using git, submitting PRs and the like, see [Sharing your Work](reference/easydata/sharing-your-work.md). 105 | 106 | ### Quick References 107 | * [Setting up and Maintaining your Conda Environment Reproducibly](reference/easydata/conda-environments.md) 108 | * [Getting and Using Datasets](reference/easydata/datasets.md) 109 | * [Using Notebooks for Analysis](reference/easydata/notebooks.md) 110 | * [Sharing your Work](reference/easydata/sharing-your-work.md) 111 | 112 | 113 | Project Organization 114 | ------------ 115 | * `LICENSE` 116 | * `Makefile` 117 | * Top-level makefile. Type `make` for a list of valid commands. 118 | * `Makefile.include` 119 | * Global includes for makefile routines. Included by `Makefile`. 120 | * `Makefile.envs` 121 | * Commands for maintaining a reproducible conda environment. Included by `Makefile`. 122 | * `README.md` 123 | * this file 124 | * `catalog` 125 | * Data catalog. This is where config information such as data sources 126 | and data transformations are saved. 127 | * `catalog/config.ini` 128 | * Local Data Store. This configuration file is for local data only, and is never checked into the repo. 129 | * `data` 130 | * Data directory. Often symlinked to a filesystem with lots of space. 131 | * `data/raw` 132 | * Raw (immutable) hash-verified downloads. 133 | * `data/interim` 134 | * Extracted and interim data representations. 135 | * `data/interim/cache` 136 | * Dataset cache 137 | * `data/processed` 138 | * The final, canonical data sets ready for analysis. 139 | * `docs` 140 | * Sphinx-format documentation files for this project. 141 | * `docs/Makefile`: Makefile for generating HTML/Latex/other formats from Sphinx-format documentation. 142 | * `notebooks` 143 | * Jupyter notebooks. Naming convention is a number (for ordering), 144 | the creator's initials, and a short `-` delimited description, 145 | e.g. `1.0-jqp-initial-data-exploration`. 146 | * `reference` 147 | * Data dictionaries, documentation, manuals, scripts, papers, or other explanatory materials. 148 | * `reference/easydata`: Easydata framework and workflow documentation. 149 | * `reference/templates`: Templates and code snippets for Jupyter 150 | * `reference/datasets`: resources related to datasets; e.g. dataset creation notebooks and scripts 151 | * `reports` 152 | * Generated analysis as HTML, PDF, LaTeX, etc. 153 | * `reports/figures` 154 | * Generated graphics and figures to be used in reporting. 155 | * `environment.yml` 156 | * The user-readable YAML file for reproducing the conda/pip environment. 157 | * `environment.(platform).lock.yml` 158 | * resolved versions, result of processing `environment.yml` 159 | * `setup.py` 160 | * Turns contents of `{{ cookiecutter.module_name }}` into a 161 | pip-installable python module (`pip install -e .`) so it can be 162 | imported in python code. 163 | * `{{ cookiecutter.module_name }}` 164 | * Source code for use in this project. 165 | * `{{ cookiecutter.module_name }}/__init__.py` 166 | * Makes `{{ cookiecutter.module_name }}` a Python module. 167 | * `{{ cookiecutter.module_name }}/data` 168 | * Scripts to fetch or generate data. 169 | * `{{ cookiecutter.module_name }}/analysis` 170 | * Scripts to turn datasets into output products. 171 | 172 | -------- 173 | 174 |

This project was built using Easydata, a Python framework aimed at making your data science workflow reproducible.

175 | -------------------------------------------------------------------------------- /docs/Add-derived-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Template for creating a dataset from an existing dataset using a single function\n", 8 | "\n", 9 | "This example creates a dataset from the `covid-19-epidemiology` dataset created in the notebook that demos [how to create a dataset from a single .csv file](Add-csv-template.ipynb). \n", 10 | "\n", 11 | "To access functionality from the `src` module throughout this notebook, use your project module, whatever you have named it." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Basic imports" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# Basic utility functions\n", 28 | "import logging\n", 29 | "import pathlib\n", 30 | "from functools import partial\n", 31 | "\n", 32 | "from src.log import logger\n", 33 | "from src.data import Dataset\n", 34 | "from src import paths, helpers" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Optionally set to debug log level\n", 44 | "#logger.setLevel(logging.DEBUG)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "%load_ext autoreload\n", 54 | "%autoreload 2" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Load existing dataset" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "ds = Dataset.load('covid-19-epidemiology')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "ds.data.shape" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "print(ds.README)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "print(ds.LICENSE)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Create a function that we want to transform by\n", 105 | "\n", 106 | "Here let's do something extremely simple, subselect by `key` which reflects a geographic region. \n", 107 | "\n", 108 | "We will use this function to create a derived dataset. As such, let's save it in the project module (`src` in this case) in `transformer_functions.py`." 
109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "project_path = paths['project_path']" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "%%writefile -a $project_path/src/data/transformer_functions.py\n", 127 | "\n", 128 | "def subselect_by_key(df, key):\n", 129 | " \"\"\"\n", 130 | " Filter dataframe by key and return resulting dataframe.\n", 131 | " \"\"\"\n", 132 | " return df[df.key == key]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "from src.data.transformer_functions import subselect_by_key" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "subselect_by_key.__module__" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "df = ds.data.copy()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "For example, `CA` will give us the numbers for Canada:" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "key_df = subselect_by_key(df, 'CA')\n", 176 | "key_df.shape" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "Here are some trends:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "key_df[['date', 'new_confirmed']].plot();" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "key_df[['date', 'new_deceased']].plot();" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "## Create a derived dataset" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "Let's create a dataset that's just the Canadian epidimelogical numbers. To do so, we only need to apply a single function to the existing data.\n", 216 | "\n", 217 | "Here is the information we need to create a dataset using `helpers.dataset_from_single_function()`:\n", 218 | "\n", 219 | " source_dataset_name\n", 220 | " dataset_name\n", 221 | " data_function\n", 222 | " added_readme_txt\n", 223 | "\n", 224 | "We'll want our `data_function` to be defined in the project module (in this case `src`) for reproducibility reasons (which we've already done with `subselect_by_key` above)." 
225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "key = 'CA'" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "source_dataset_name = 'covid-19-epidemiology'\n", 243 | "dataset_name = f'covid-19-epidemiology-{key}'\n", 244 | "data_function = partial(subselect_by_key, key=key)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "added_readme_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n", 254 | "to the {key} dataset.\"\"\"" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# test out the function\n", 264 | "data_function(df).shape" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "### Use the helper function to create the derived dataset" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "ds = helpers.dataset_from_single_function(\n", 281 | " source_dataset_name=source_dataset_name,\n", 282 | " dataset_name=dataset_name,\n", 283 | " data_function=data_function,\n", 284 | " added_readme_txt=added_readme_txt,\n", 285 | " overwrite_catalog=True)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "dataset_name" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "ds = Dataset.load(dataset_name)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "ds.data.shape" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "print(ds.README)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "print(ds.LICENSE)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "ds.data[['date', 'new_confirmed']].plot();" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "## Check-in the new catalog files\n", 347 | "Finally check in the new catalog files." 
348 | ] 349 | } 350 | ], 351 | "metadata": { 352 | "kernelspec": { 353 | "display_name": "Python [conda env:easydata-notebook]", 354 | "language": "python", 355 | "name": "conda-env-easydata-notebook-py" 356 | }, 357 | "language_info": { 358 | "codemirror_mode": { 359 | "name": "ipython", 360 | "version": 3 361 | }, 362 | "file_extension": ".py", 363 | "mimetype": "text/x-python", 364 | "name": "python", 365 | "nbconvert_exporter": "python", 366 | "pygments_lexer": "ipython3", 367 | "version": "3.7.10" 368 | } 369 | }, 370 | "nbformat": 4, 371 | "nbformat_minor": 4 372 | } 373 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md: -------------------------------------------------------------------------------- 1 | # Setting up and Maintaining your Conda Environment (Reproducibly) 2 | 3 | The `{{ cookiecutter.repo_name }}` repo is set up with template code to make managing your conda environments easy and reproducible. Not only will _future you_ appreciate this, but so will anyone else who needs to work with your code after today. 4 | 5 | If you haven't yet, configure your conda environment. 6 | 7 | **WARNING**: If you have conda-forge listed as a channel in your `.condarc` (or any other channels other than defaults), you may experience great difficulty generating reproducible conda environments. 8 | 9 | We recommend you remove conda-forge (and all other non-default channels) from your `.condarc` file and [set your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html). You can still use conda-forge (or any other conda channel), just specify it explicitly in your `environment.yml` by prefixing your package name with `channel-name::`; e.g. 10 | ``` 11 | - wheel # install from the default (anaconda) channel 12 | - pytorch::pytorch # install this from the `pytorch` channel 13 | - conda-forge::tokenizers # install this from conda-forge 14 | ``` 15 | 16 | ## Configuring your python environment 17 | Easydata uses conda to manage python packages installed by both conda **and pip**. 18 | 19 | ### Adjust your `.condarc` 20 | 21 | ``` 22 | conda config --set channel_priority strict 23 | ``` 24 | Whenever possible, re-order your channels so that `defaults` is first. 25 | 26 | ``` 27 | conda config --prepend channels defaults 28 | ``` 29 | 30 | **Note for JupyterHub Users**: You will need to store your conda environments in your **home directory** so that they will be persisted across JupyterHub sessions. 31 | ``` 32 | conda config --prepend envs_dirs ~/.conda/envs # Store environments in local dir for JupyterHub 33 | ``` 34 | 35 | #### Locating the `conda` binary 36 | Ensure the Makefile can find your conda binary, either by setting the `CONDA_EXE` environment variable, or by modifying `Makefile.include` directly. 37 | 38 | First, check if `CONDA_EXE` is already set: 39 | ``` 40 | >>> export | grep CONDA_EXE 41 | CONDA_EXE=/Users/your_username/miniconda3/bin/conda 42 | ``` 43 | 44 | If `CONDA_EXE` is not set, you will need to set it manually in `Makefile.include`; i.e. 45 | 46 | * Make note of the path to your conda binary. It should be in the `bin` subdirectory of your Anaconda (or miniconda) installation directory: 47 | ``` 48 | >>> which conda # this will only work if conda is in your PATH, otherwise, verify manually 49 | ~/miniconda3/bin/conda 50 | ``` 51 | * ensure your `CONDA_EXE` environment variable is set to this value; i.e.
52 | ``` 53 | >>> export CONDA_EXE=~/miniconda3/bin/conda 54 | ``` 55 | or edit `Makefile.include` directly. 56 | 57 | ### Create the conda environment 58 | Create and switch to the virtual environment: 59 | ``` 60 | cd {{ cookiecutter.repo_name }} 61 | make create_environment 62 | conda activate {{ cookiecutter.repo_name }} 63 | make update_environment 64 | ``` 65 | **Note**: When creating the environment the first time, you really do need to run **both** `make create_environment` and `make update_environment` for the `{{ cookiecutter.module_name }}` module to install correctly. 66 | 67 | To activate the environment, simply `conda activate {{ cookiecutter.repo_name }}` 68 | 69 | To deactivate it and return to your base environment, use `conda deactivate` 70 | 71 | ## Maintaining your Python environment 72 | 73 | ### Updating your conda and pip environments 74 | The `make` commands `make create_environment` and `make update_environment` are wrappers that allow you to easily manage your conda and pip environments using the `environment.yml` file. 75 | 76 | (If you ever forget which `make` command to run, you can run `make` by itself and it will provide a list of commands that are available.) 77 | 78 | 79 | When adding packages to your python environment, **do not `pip install` or `conda install` directly**. Always edit `environment.yml` and run `make update_environment` instead. 80 | 81 | Your `environment.yml` file will look something like this: 82 | ``` 83 | name: {{ cookiecutter.repo_name }} 84 | dependencies: 85 | - pip 86 | - pip: 87 | - -e . # conda >= 4.4 only 88 | - python-dotenv>=0.5.1 89 | - nbval 90 | - nbdime 91 | - umap-learn 92 | - gdown 93 | - setuptools 94 | - wheel 95 | - git>=2.5 # for git worktree template updating 96 | - sphinx 97 | - bokeh 98 | - click 99 | - colorcet 100 | - coverage 101 | - coveralls 102 | - datashader 103 | - holoviews 104 | - matplotlib 105 | - jupyter 106 | ... 107 | ``` 108 | To add any package available from conda, add it to the end of the list. If you have a PyPI dependency that's not available via conda, add it to the list of pip installable dependencies under ` - pip:`. 109 | 110 | You can include any `{{ cookiecutter.upstream_location }}` python-based project in the `pip` section via `git+https://{{ cookiecutter.upstream_location }}/<username>/<repo>`. 111 | 112 | In particular, if you're working off of a fork or a work in progress branch of a repo in {{ cookiecutter.upstream_location }} (say, your personal version of `<repo>`), you can change `git+https://{{ cookiecutter.upstream_location }}/<username>/<repo>` to 113 | 114 | * `git+https://{{ cookiecutter.upstream_location }}/<your-username>/<repo>.git` to point to the {{cookiecutter.default_branch}} branch of your fork and 115 | * `git+https://{{ cookiecutter.upstream_location }}/<your-username>/<repo>.git@<branch>` to point to a specific branch. 116 | 117 | Once you're done with your edits, run `make update_environment` and voila, you're updated. 118 | 119 | To share your updated environment, check in your `environment.yml` file. (More on this in [Sharing your Work](sharing-your-work.md)) 120 | 121 | #### Adding packages from other conda channels 122 | Say we want to add a package only available from the `conda-forge` conda channel and not the default conda channel. (The conda channel is what follows `-c` when using `conda install -c my-channel my-package`.) Suppose we want to use `make` on Windows. Then we need to use `conda-forge` since the default conda channel only has Linux and macOS installations of `make`.
Normally, to install this with conda, we would use `conda install -c conda-forge make`. **We won't do that here**. 123 | 124 | Instead, we add a `channel-order` section that starts with `defaults` and lists the other channels we want to use in the order we want to install from them (note that this is a custom Easydata section of the `environment.yml`). Then we add our package in the dependency list in the form `channel-name::package-name`, for example, `conda-forge::make`. 125 | 126 | In this case, an updated `environment.yml` file looks like this: 127 | ``` 128 | name: {{ cookiecutter.repo_name }} 129 | channel-order: 130 | - defaults 131 | - conda-forge 132 | dependencies: 133 | - pip 134 | - pip: 135 | - -e . # conda >= 4.4 only 136 | - python-dotenv>=0.5.1 137 | - nbval 138 | - nbdime 139 | - umap-learn 140 | - gdown 141 | - setuptools 142 | - wheel 143 | - git>=2.5 # for git worktree template updating 144 | - sphinx 145 | - bokeh 146 | - click 147 | - colorcet 148 | - coverage 149 | - coveralls 150 | - datashader 151 | - holoviews 152 | - matplotlib 153 | - jupyter 154 | - conda-forge::make 155 | ... 156 | ``` 157 | 158 | 159 | #### Lock files 160 | Now, we'll admit that this workflow isn't perfectly reproducible in the sense that conda still has to resolve versions from the `environment.yml`. To make it more reproducible, running either `make create_environment` or `make update_environment` will generate an `environment.{$ARCH}.lock.yml` (e.g. `environment.i386.lock.yml`). This file keeps a record of the exact environment that is currently installed in your conda environment `{{ cookiecutter.repo_name }}`. If you ever need to reproduce an environment exactly, you can install from the `.lock.yml` file. (Note: these files are architecture-dependent.) 161 | 162 | #### Using your conda environment in a Jupyter notebook 163 | If you make a new notebook, select the `{{ cookiecutter.repo_name }}` environment from within the notebook. If you are somehow in another kernel, select **Kernel -> Change kernel -> Python[conda env:{{ cookiecutter.repo_name }}]**. If you don't seem to have that option, make sure that you ran `jupyter notebook` with the `{{ cookiecutter.repo_name }}` conda environment enabled, and that `which jupyter` points to the correct (`{{ cookiecutter.repo_name }}`) version of jupyter. 164 | 165 | If you want your environment changes (or `{{ cookiecutter.module_name }}` module edits) to be immediately available in your running notebooks, make sure to run a notebook cell containing 166 | ``` 167 | %load_ext autoreload 168 | %autoreload 2 169 | ``` 170 | 171 | More on notebooks can be found in [Using Notebooks for Analysis](notebooks.md). 172 | 173 | ### Nuke it from orbit 174 | Sometimes, you need to be sure. Making things reproducible means that blowing things away completely and rebuilding from scratch is always an option. To do so: 175 | ``` 176 | conda deactivate 177 | make delete_environment 178 | make create_environment 179 | conda activate {{ cookiecutter.repo_name }} 180 | touch environment.yml 181 | make update_environment 182 | ``` 183 | and then proceed with managing your environment as above.
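If you ever need to rebuild the environment exactly as it was last resolved (rather than letting conda re-solve `environment.yml`), you can create an environment directly from the generated lock file. A minimal sketch, assuming your platform's lock file is `environment.i386.lock.yml` and that you want the exact copy under a separate name (the `-exact` suffix here is just an illustrative choice):
```
conda deactivate
# create a fresh environment from the fully-resolved lock file
conda env create --file environment.i386.lock.yml --name {{ cookiecutter.repo_name }}-exact
conda activate {{ cookiecutter.repo_name }}-exact
```
Since lock files are architecture-dependent, only use one that was generated on the same platform you are installing to.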
184 | 185 | ### Quick References 186 | 187 | * [README](../README.md) 188 | * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) 189 | * [Getting and Using Datasets](datasets.md) 190 | * [Using Notebooks for Analysis](notebooks.md) 191 | * [Sharing your Work](sharing-your-work.md) 192 | * [Troubleshooting Guide](troubleshooting.md) 193 | -------------------------------------------------------------------------------- /{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/data/dataset-test.json: -------------------------------------------------------------------------------- 1 | { 2 | "wine_reviews": { 3 | "dataset_name": "wine_reviews", 4 | "descr": "\n### Content\n\nThis dataset contains three files:\n\n * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.\n\n * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. (Does not have Taster info)\n\n * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.\n\nThe data consists of 13 fields:\n\n* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)\n* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature\n* Variety: the type of grapes used to make the wine (ie Pinot Noir)\n* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.\n* Country: the country that the wine is from\n* Province: the province or state that the wine is from\n* Region 1: the wine growing area in a province or state (ie Napa)\n* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank\n* Winery: the winery that made the wine\n* Designation: the vineyard within the winery where the grapes that made the wine are from\n* Price: the cost for a bottle of the wine\n* Taster Name: name of the person who tasted and reviewed the wine\n* Taster Twitter Handle: Twitter handle for the person who tasted ane reviewed the wine\n\nUPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entires was resolved\n\n### Acknowledgements\n\nThe data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. (https://github.com/zackthoutt/wine-deep-learning)\n\nUPDATE 11/24/2017\nAfter feedback from users of the dataset I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the tasters name, and the taster's Twitter handle. This should also fix the duplicate entry issue.", 5 | "hashes": { 6 | "data": "sha1:120c359cedf8b75e1e9fb7d280668e51eea2e43f", 7 | "target": "sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b" 8 | }, 9 | "license": "CC BY-NC-SA 4.0" 10 | }, 11 | "wine_reviews_130k": { 12 | "dataset_name": "wine_reviews_130k", 13 | "descr": "\n### Content\n\nThis dataset contains three files:\n\n * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.\n\n * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. 
(Does not have Taster info)\n\n * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.\n\nThe data consists of 13 fields:\n\n* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)\n* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature\n* Variety: the type of grapes used to make the wine (ie Pinot Noir)\n* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.\n* Country: the country that the wine is from\n* Province: the province or state that the wine is from\n* Region 1: the wine growing area in a province or state (ie Napa)\n* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank\n* Winery: the winery that made the wine\n* Designation: the vineyard within the winery where the grapes that made the wine are from\n* Price: the cost for a bottle of the wine\n* Taster Name: name of the person who tasted and reviewed the wine\n* Taster Twitter Handle: Twitter handle for the person who tasted ane reviewed the wine\n\nUPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entires was resolved\n\n### Acknowledgements\n\nThe data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. (https://github.com/zackthoutt/wine-deep-learning)\n\nUPDATE 11/24/2017\nAfter feedback from users of the dataset I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the tasters name, and the taster's Twitter handle. This should also fix the duplicate entry issue.", 14 | "hashes": { 15 | "data": "sha1:9d8db83e00877dbe2ce862040d677b29eb4e23b3", 16 | "target": "sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b" 17 | }, 18 | "license": "CC BY-NC-SA 4.0" 19 | }, 20 | "wine_reviews_130k_varietals_75": { 21 | "dataset_name": "wine_reviews_130k_varietals_75", 22 | "descr": "\n### Content\n\nThis dataset contains three files:\n\n * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.\n\n * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. 
(Does not have Taster info)\n\n * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.\n\nThe data consists of 13 fields:\n\n* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)\n* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature\n* Variety: the type of grapes used to make the wine (ie Pinot Noir)\n* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.\n* Country: the country that the wine is from\n* Province: the province or state that the wine is from\n* Region 1: the wine growing area in a province or state (ie Napa)\n* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank\n* Winery: the winery that made the wine\n* Designation: the vineyard within the winery where the grapes that made the wine are from\n* Price: the cost for a bottle of the wine\n* Taster Name: name of the person who tasted and reviewed the wine\n* Taster Twitter Handle: Twitter handle for the person who tasted ane reviewed the wine\n\nUPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entires was resolved\n\n### Acknowledgements\n\nThe data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. (https://github.com/zackthoutt/wine-deep-learning)\n\nUPDATE 11/24/2017\nAfter feedback from users of the dataset I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the tasters name, and the taster's Twitter handle. This should also fix the duplicate entry issue.Subselection of the dataset that only includes entries for wines with a given varietal that appeas in at least 75 different entries", 23 | "hashes": { 24 | "data": "sha1:d76d24f6ecd309aec82545c39af107c82edebc2f", 25 | "target": "sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b" 26 | }, 27 | "license": "CC BY-NC-SA 4.0" 28 | }, 29 | "wine_reviews_150k": { 30 | "dataset_name": "wine_reviews_150k", 31 | "descr": "\n### Content\n\nThis dataset contains three files:\n\n * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.\n\n * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. 
(Does not have Taster info)\n\n * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.\n\nThe data consists of 13 fields:\n\n* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)\n* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature\n* Variety: the type of grapes used to make the wine (ie Pinot Noir)\n* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.\n* Country: the country that the wine is from\n* Province: the province or state that the wine is from\n* Region 1: the wine growing area in a province or state (ie Napa)\n* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank\n* Winery: the winery that made the wine\n* Designation: the vineyard within the winery where the grapes that made the wine are from\n* Price: the cost for a bottle of the wine\n* Taster Name: name of the person who tasted and reviewed the wine\n* Taster Twitter Handle: Twitter handle for the person who tasted ane reviewed the wine\n\nUPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entires was resolved\n\n### Acknowledgements\n\nThe data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. (https://github.com/zackthoutt/wine-deep-learning)\n\nUPDATE 11/24/2017\nAfter feedback from users of the dataset I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the tasters name, and the taster's Twitter handle. This should also fix the duplicate entry issue.", 32 | "hashes": { 33 | "data": "sha1:84c8540f48e1350e0cf5c92a3064711b96e1a5ff", 34 | "target": "sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b" 35 | }, 36 | "license": "CC BY-NC-SA 4.0" 37 | } 38 | } --------------------------------------------------------------------------------