├── .nojeklyll
├── test_files
│   └── requirements.txt
├── fastkaggle
│   ├── __init__.py
│   ├── _nbdev.py
│   ├── _modidx.py
│   └── core.py
├── images
│   ├── library-fastkaggle.png
│   └── libraries-pawpularity.png
├── MANIFEST.in
├── styles.css
├── install_quarto.sh
├── CHANGELOG.md
├── _quarto.yml
├── settings.ini
├── .github
│   └── workflows
│       └── deploy.yaml
├── Makefile
├── .gitignore
├── setup.py
├── README.md
├── index.ipynb
├── LICENSE
└── 00_core.ipynb
/.nojeklyll:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test_files/requirements.txt:
--------------------------------------------------------------------------------
1 | fastcore
2 | timm
--------------------------------------------------------------------------------
/fastkaggle/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.8"
2 | from .core import *
3 |
4 |
--------------------------------------------------------------------------------
/images/library-fastkaggle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastai/fastkaggle/master/images/library-fastkaggle.png
--------------------------------------------------------------------------------
/images/libraries-pawpularity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastai/fastkaggle/master/images/libraries-pawpularity.png
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include settings.ini
2 | include LICENSE
3 | include CONTRIBUTING.md
4 | include README.md
5 | recursive-exclude * __pycache__
6 |
--------------------------------------------------------------------------------
/styles.css:
--------------------------------------------------------------------------------
1 | .cell-output pre {
2 | margin-left: 0.8rem;
3 | margin-top: 0;
4 | background: none;
5 | border-left: 2px solid lightsalmon;
6 | border-top-left-radius: 0;
7 | border-top-right-radius: 0;
8 | }
9 |
10 | .cell-output .sourceCode {
11 | background: none;
12 | margin-top: 0;
13 | }
14 |
15 | .cell > .sourceCode {
16 | margin-bottom: 0;
17 | }
18 |
--------------------------------------------------------------------------------
/fastkaggle/_nbdev.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED BY NBDEV! DO NOT EDIT!
2 |
3 | __all__ = ["index", "modules", "custom_doc_links", "git_url"]
4 |
5 | index = {"iskaggle": "00_core.ipynb",
6 | "setup_comp": "00_core.ipynb",
7 | "nb_meta": "00_core.ipynb",
8 | "push_notebook": "00_core.ipynb"}
9 |
10 | modules = ["core.py"]
11 |
12 | doc_url = "https://fastai.github.io/fastkaggle/"
13 |
14 | git_url = "https://github.com/fastai/fastkaggle/tree/master/"
15 |
16 | def custom_doc_links(name): return None
17 |
--------------------------------------------------------------------------------
/install_quarto.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | install_linux() {
4 | echo "...installing Quarto"
5 | wget -nv https://www.quarto.org/download/latest/quarto-linux-amd64.deb
6 | sudo dpkg -i *64.deb
7 | rm *64.deb
8 | }
9 |
10 | install_mac() {
11 | echo "...downloading Quarto installer"
12 | wget -nv https://www.quarto.org/download/latest/quarto-macos.pkg
13 | echo "...opening installer for Quarto"
14 | open quarto-macos.pkg
15 | }
16 |
17 | case "$OSTYPE" in
18 | linux*) install_linux ;;
19 | darwin*) install_mac ;;
20 | *) echo "make sure you install the latest version of quarto: https://quarto.org/docs/get-started/" ;;
21 | esac
22 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Release notes
2 |
3 |
4 |
5 | ## 0.0.7
6 |
7 | ### New Features
8 |
9 | - Datasets functionality + Docs ([#9](https://github.com/fastai/fastkaggle/pull/9)), thanks to [@Isaac-Flath](https://github.com/Isaac-Flath)
10 |   - Two high-level functions let users pass either a list of libraries or a requirements.txt file to maintain and update their own libraries as Kaggle datasets.
11 |
12 |
13 | ## 0.0.6
14 |
15 | ### Bugs Squashed
16 |
17 | - fix: comp should be competition in setup_comp ([#3](https://github.com/fastai/fastkaggle/pull/3)), thanks to [@n-e-w](https://github.com/n-e-w)
18 |
19 |
20 | ## 0.0.4
21 |
22 | ### Bugs Squashed
23 |
24 | - api not exported ([#1](https://github.com/fastai/fastkaggle/issues/1))
25 |
26 |
27 | ## 0.0.1
28 |
29 | - init release
30 |
31 |
--------------------------------------------------------------------------------
/_quarto.yml:
--------------------------------------------------------------------------------
1 | ipynb-filters: [nbdev_filter]
2 |
3 | project:
4 | type: website
5 | output-dir: docs
6 | preview:
7 | port: 3000
8 | browser: false
9 |
10 | format:
11 | html:
12 | theme: cosmo
13 | css: styles.css
14 | toc: true
15 | toc-depth: 4
16 |
17 | website:
18 | title: "fastkaggle"
19 | site-url: "https://fastai.github.io/fastkaggle/"
20 | description: "Kaggling for fast kagglers!"
21 | execute:
22 | enabled: false
23 | twitter-card: true
24 | open-graph: true
25 | reader-mode: true
26 | repo-branch: master
27 | repo-url: "https://github.com/fastai/fastkaggle/tree/master/"
28 | repo-actions: [issue]
29 | navbar:
30 | background: primary
31 | search: true
32 | right:
33 | - icon: github
34 | href: "https://github.com/fastai/fastkaggle/tree/master/"
35 | sidebar:
36 | style: "floating"
37 |
38 | metadata-files:
39 | - sidebar.yml
40 | - custom.yml
41 |
--------------------------------------------------------------------------------
/settings.ini:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | host = github
3 | lib_name = fastkaggle
4 | description = Kaggling for fast kagglers!
5 | copyright = Jeremy Howard, 2022 onwards
6 | keywords = machine-learning kaggle fastai nbdev
7 | user = fastai
8 | author = Jeremy Howard
9 | author_email = info@fast.ai
10 | branch = master
11 | version = 0.0.8
12 | min_python = 3.7
13 | audience = Developers
14 | language = English
15 | requirements = fastcore>=1.4.5 kaggle
16 | custom_sidebar = False
17 | license = apache2
18 | status = 2
19 | nbs_path = .
20 | doc_path = docs
21 | recursive = False
22 | tst_flags = notest
23 | doc_host = https://fastai.github.io
24 | doc_baseurl = /fastkaggle/
25 | git_url = https://github.com/fastai/fastkaggle/tree/master/
26 | lib_path = fastkaggle
27 | title = fastkaggle
28 | black_formatting = False
29 | readme_nb = index.ipynb
30 | allowed_metadata_keys =
31 | allowed_cell_metadata_keys =
32 | jupyter_hooks = True
33 | clean_ids = True
34 |
35 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yaml:
--------------------------------------------------------------------------------
1 | name: Deploy to GitHub Pages
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | - main
8 | workflow_dispatch:
9 |
10 | jobs:
11 | deploy:
12 | name: Deploy to GitHub Pages
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v3
16 | - uses: actions/setup-python@v3
17 | - name: Install Dependencies
18 | run: |
19 | python -m pip install --upgrade pip
20 | pip install nbdev
21 | make install
22 | - name: Build website
23 | env:
24 | KAGGLE_USERNAME: test
25 | KAGGLE_KEY: test
26 | run: make docs
27 | - name: Deploy to GitHub Pages
28 | uses: peaceiris/actions-gh-pages@v3
29 | with:
30 | github_token: ${{ secrets.GITHUB_TOKEN }}
31 | force_orphan: true
32 | publish_dir: ./docs
33 | # The following lines assign commit authorship to the official
34 | # GH-Actions bot for deploys to `gh-pages` branch:
35 | # https://github.com/actions/checkout/issues/13#issuecomment-724415212
36 |         # The GH Actions bot is used by default if you don't specify the two fields.
37 | # You can swap them out with your own user credentials.
38 | user_name: github-actions[bot]
39 | user_email: 41898282+github-actions[bot]@users.noreply.github.com
40 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .ONESHELL:
2 | SHELL := /bin/bash
3 |
4 | exp:
5 | nbdev_clean
6 | nbdev_export
7 |
8 | help: ## Show this help
9 | @egrep -h '\s##\s' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
10 |
11 | sync: ## Propagates any change in the modules (.py files) to the notebooks that created them
12 | nbdev_update
13 |
14 | deploy: docs ## Push local docs to gh-pages branch
15 | nbdev_ghp_deploy
16 |
17 | preview: ## Live preview quarto docs with hot reloading.
18 | nbdev_sidebar
19 | nbdev_export
20 | 	IN_TEST=1 nbdev_quarto --preview
21 |
22 | docs: .FORCE ## Build quarto docs and put them into folder specified in `doc_path` in settings.ini
23 | nbdev_export
24 | nbdev_quarto
25 |
26 | prepare: ## Export notebooks to python modules, test code and clean notebooks.
27 | nbdev_export
28 | nbdev_test
29 | nbdev_clean
30 |
31 | test: ## Test notebooks
32 | nbdev_test
33 |
34 | release_all: pypi release_conda ## Release python package on pypi and conda. Also bumps version number automatically.
35 | nbdev_bump_version
36 | nbdev_export
37 |
38 | release_pypi: pypi ## Release python package on pypi. Also bumps version number automatically.
39 | nbdev_export
40 | nbdev_bump_version
41 |
42 | release_conda:
43 | fastrelease_conda_package
44 |
45 | pypi: dist
46 | twine upload --repository pypi dist/*
47 |
48 | dist: clean
49 | python setup.py sdist bdist_wheel
50 |
51 | clean:
52 | rm -rf dist
53 |
54 |
55 | install: install_quarto ## Install quarto and the latest version of the local Python package as an editable install
56 | pip install -e ".[dev]"
57 |
58 | install_py: .FORCE
59 | nbdev_export
60 | pip install -e ".[dev]"
61 |
62 | install_quarto: .FORCE ## Install the latest version of quarto for Mac and Linux. Go to https://quarto.org/docs/get-started/ for Windows.
63 | ./install_quarto.sh
64 |
65 | .FORCE:
66 |
67 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | sidebar.yml
2 | conda/
3 | titanic*
4 | docs/
5 |
6 | *.bak
7 | .gitattributes
8 | .last_checked
9 | .gitconfig
10 | *.bak
11 | *.log
12 | *~
13 | ~*
14 | _tmp*
15 | tmp*
16 | tags
17 | *.pkg
18 |
19 | # Byte-compiled / optimized / DLL files
20 | __pycache__/
21 | *.py[cod]
22 | *$py.class
23 |
24 | # C extensions
25 | *.so
26 |
27 | # Distribution / packaging
28 | .Python
29 | env/
30 | build/
31 | develop-eggs/
32 | dist/
33 | downloads/
34 | eggs/
35 | .eggs/
36 | lib/
37 | lib64/
38 | parts/
39 | sdist/
40 | var/
41 | wheels/
42 | *.egg-info/
43 | .installed.cfg
44 | *.egg
45 |
46 | # PyInstaller
47 | # Usually these files are written by a python script from a template
48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest
50 | *.spec
51 |
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 |
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .coverage
60 | .coverage.*
61 | .cache
62 | nosetests.xml
63 | coverage.xml
64 | *.cover
65 | .hypothesis/
66 |
67 | # Translations
68 | *.mo
69 | *.pot
70 |
71 | # Django stuff:
72 | *.log
73 | local_settings.py
74 |
75 | # Flask stuff:
76 | instance/
77 | .webassets-cache
78 |
79 | # Scrapy stuff:
80 | .scrapy
81 |
82 | # Sphinx documentation
83 | docs/_build/
84 |
85 | # PyBuilder
86 | target/
87 |
88 | # Jupyter Notebook
89 | .ipynb_checkpoints
90 |
91 | # pyenv
92 | .python-version
93 |
94 | # celery beat schedule file
95 | celerybeat-schedule
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # dotenv
101 | .env
102 |
103 | # virtualenv
104 | .venv
105 | venv/
106 | ENV/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 |
121 | .vscode
122 | *.swp
123 |
124 | # osx generated files
125 | .DS_Store
126 | .DS_Store?
127 | .Trashes
128 | ehthumbs.db
129 | Thumbs.db
130 | .idea
131 |
132 | # pytest
133 | .pytest_cache
134 |
135 | # tools/trust-doc-nbs
136 | docs_src/.last_checked
137 |
138 | # symlinks to fastai
139 | docs_src/fastai
140 | tools/fastai
141 |
142 | # link checker
143 | checklink/cookies.txt
144 |
145 | # .gitconfig is now autogenerated
146 | .gitconfig
147 |
148 |
149 | /.quarto/
150 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from pkg_resources import parse_version
2 | from configparser import ConfigParser
3 | import setuptools
4 | assert parse_version(setuptools.__version__)>=parse_version('36.2')
5 |
6 | # note: all settings are in settings.ini; edit there, not here
7 | config = ConfigParser(delimiters=['='])
8 | config.read('settings.ini')
9 | cfg = config['DEFAULT']
10 |
11 | cfg_keys = 'version description keywords author author_email'.split()
12 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split()
13 | for o in expected: assert o in cfg, "missing expected setting: {}".format(o)
14 | setup_cfg = {o:cfg[o] for o in cfg_keys}
15 |
16 | licenses = {
17 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
18 | 'mit': ('MIT License', 'OSI Approved :: MIT License'),
19 | 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'),
20 | 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'),
21 | 'bsd3': ('BSD License', 'OSI Approved :: BSD License'),
22 | }
23 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
24 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
25 | py_versions = '2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8'.split()
26 |
27 | requirements = cfg.get('requirements','').split()
28 | min_python = cfg['min_python']
29 | lic = licenses.get(cfg['license'].lower(), (cfg['license'], None))
30 | dev_requirements = (cfg.get('dev_requirements') or '').split()
31 |
32 | setuptools.setup(
33 | name = cfg['lib_name'],
34 | license = lic[0],
35 | classifiers = [
36 | 'Development Status :: ' + statuses[int(cfg['status'])],
37 | 'Intended Audience :: ' + cfg['audience'].title(),
38 | 'Natural Language :: ' + cfg['language'].title(),
39 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []),
40 | url = cfg['git_url'],
41 | packages = setuptools.find_packages(),
42 | include_package_data = True,
43 | install_requires = requirements,
44 | extras_require={ 'dev': dev_requirements },
45 | dependency_links = cfg.get('dep_links','').split(),
46 | python_requires = '>=' + cfg['min_python'],
47 | long_description = open('README.md').read(),
48 | long_description_content_type = 'text/markdown',
49 | zip_safe = False,
50 | entry_points = {
51 | 'console_scripts': cfg.get('console_scripts','').split(),
52 | 'mkdocs.plugins': [ 'rm_num_prefix = nbdev.mkdocs:RmNumPrefix' ],
53 | 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d']
54 | },
55 | **setup_cfg)
56 |
57 |
58 |
--------------------------------------------------------------------------------
/fastkaggle/_modidx.py:
--------------------------------------------------------------------------------
1 | # Autogenerated by nbdev
2 |
3 | d = { 'settings': { 'allowed_cell_metadata_keys': '',
4 | 'allowed_metadata_keys': '',
5 | 'audience': 'Developers',
6 | 'author': 'Jeremy Howard',
7 | 'author_email': 'info@fast.ai',
8 | 'black_formatting': 'False',
9 | 'branch': 'master',
10 | 'clean_ids': 'True',
11 | 'copyright': 'Jeremy Howard, 2022 onwards',
12 | 'custom_sidebar': 'False',
13 | 'description': 'Kaggling for fast kagglers!',
14 | 'doc_baseurl': '/fastkaggle/',
15 | 'doc_host': 'https://fastai.github.io',
16 | 'doc_path': 'docs',
17 | 'git_url': 'https://github.com/fastai/fastkaggle/tree/master/',
18 | 'host': 'github',
19 | 'jupyter_hooks': 'True',
20 | 'keywords': 'machine-learning kaggle fastai nbdev',
21 | 'language': 'English',
22 | 'lib_name': 'fastkaggle',
23 | 'lib_path': 'fastkaggle',
24 | 'license': 'apache2',
25 | 'min_python': '3.7',
26 | 'nbs_path': '.',
27 | 'readme_nb': 'index.ipynb',
28 | 'recursive': 'False',
29 | 'requirements': 'fastcore>=1.4.5 kaggle',
30 | 'status': '2',
31 | 'title': 'fastkaggle',
32 | 'tst_flags': 'notest',
33 | 'user': 'fastai',
34 | 'version': '0.0.8'},
35 | 'syms': { 'fastkaggle.core': { 'fastkaggle.core.check_ds_exists': 'https://fastai.github.io/fastkaggle/core.html#check_ds_exists',
36 | 'fastkaggle.core.create_libs_datasets': 'https://fastai.github.io/fastkaggle/core.html#create_libs_datasets',
37 | 'fastkaggle.core.create_requirements_dataset': 'https://fastai.github.io/fastkaggle/core.html#create_requirements_dataset',
38 | 'fastkaggle.core.get_dataset': 'https://fastai.github.io/fastkaggle/core.html#get_dataset',
39 | 'fastkaggle.core.get_local_ds_ver': 'https://fastai.github.io/fastkaggle/core.html#get_local_ds_ver',
40 | 'fastkaggle.core.get_pip_libraries': 'https://fastai.github.io/fastkaggle/core.html#get_pip_libraries',
41 | 'fastkaggle.core.get_pip_library': 'https://fastai.github.io/fastkaggle/core.html#get_pip_library',
42 | 'fastkaggle.core.import_kaggle': 'https://fastai.github.io/fastkaggle/core.html#import_kaggle',
43 | 'fastkaggle.core.iskaggle': 'https://fastai.github.io/fastkaggle/core.html#iskaggle',
44 | 'fastkaggle.core.mk_dataset': 'https://fastai.github.io/fastkaggle/core.html#mk_dataset',
45 | 'fastkaggle.core.nb_meta': 'https://fastai.github.io/fastkaggle/core.html#nb_meta',
46 | 'fastkaggle.core.push_dataset': 'https://fastai.github.io/fastkaggle/core.html#push_dataset',
47 | 'fastkaggle.core.push_notebook': 'https://fastai.github.io/fastkaggle/core.html#push_notebook',
48 | 'fastkaggle.core.setup_comp': 'https://fastai.github.io/fastkaggle/core.html#setup_comp'}}}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | fastkaggle
2 | ================
3 |
4 |
5 |
6 | ## Install
7 |
8 | Either:
9 |
10 | pip install fastkaggle
11 |
12 | or:
13 |
14 | mamba install -c fastai fastkaggle
15 |
16 | (or replace `mamba` with `conda` if you don’t mind it taking much longer
17 | to run…)
18 |
19 | ## How to use
20 |
21 | ### Competition
22 |
23 | This little library is where I’ll be putting snippets of stuff which are
24 | useful on Kaggle. Functionality includes the following:
25 |
26 | It defines
27 | [`iskaggle`](https://fastai.github.io/fastkaggle/core.html#iskaggle)
28 | which is `True` if you’re running on Kaggle:
29 |
30 | ``` python
31 | 'Kaggle' if iskaggle else 'Not Kaggle'
32 | ```
33 |
34 | 'Not Kaggle'
35 |
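For instance, `iskaggle` can gate environment-specific settings. A minimal
sketch (the batch-size choice below is purely illustrative):

``` python
from fastkaggle import *

# `iskaggle` is truthy only when the KAGGLE_KERNEL_RUN_TYPE environment
# variable is set, i.e. when this code is running inside a Kaggle kernel.
bs = 32 if iskaggle else 128  # e.g. smaller batches on Kaggle GPUs
```
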
36 | It provides a
37 | [`setup_comp`](https://fastai.github.io/fastkaggle/core.html#setup_comp)
38 | function which gets a path to the data for a competition, downloading it
39 | if needed, and also installs any modules that might be missing or out of
40 | date if running on Kaggle:
41 |
42 | ``` python
43 | setup_comp('titanic')
44 | ```
45 |
46 | Path('titanic')
47 |
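If you pass space-separated module names via `install`, they'll be
pip-installed first when running on Kaggle (the modules below are just
examples):

``` python
setup_comp('titanic', install='fastai timm')
```
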
48 | There’s also
49 | [`push_notebook`](https://fastai.github.io/fastkaggle/core.html#push_notebook)
50 | to push a notebook to Kaggle Notebooks, and
51 | [`import_kaggle`](https://fastai.github.io/fastkaggle/core.html#import_kaggle)
52 | to use the Kaggle API (even when you’re on Kaggle!). See the
53 | `fastkaggle.core` docs for details.
54 |
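For example, here’s how the `fastkaggle.core` docs push one of Jeremy’s
competition notebooks (Kaggle recommends the `id` match the slugified
title):

``` python
push_notebook('jhoward', 'first-steps-road-to-the-top-part-1',
              title='First Steps: Road to the Top, Part 1',
              file='first-steps-road-to-the-top-part-1.ipynb',
              competition='paddy-disease-classification',
              private=False, gpu=True)
```
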
55 | ### Datasets
56 |
57 | This section is designed to make uploading pip libraries to kaggle
58 | datasets easy. There are two primary high-level functions. First we
59 | define our Kaggle username and the local path we want to use to store
60 | datasets when we create them.
61 |
62 |
63 |
64 | > **Usage tip**
65 | >
66 | > The purpose of this is to create datasets that can be used in no
67 | > internet inference competitions to install libraries using
68 | > `pip install -Uqq library --no-index --find-links=file:///kaggle/input/your_dataset/`
69 |
70 |
71 |
72 | ``` python
73 | lib_path = Path('/root/kaggle_datasets')
74 | username = 'isaacflath'
75 | ```
76 |
77 | #### List of Libraries
78 |
79 | We can take a list of libraries and upload them as separate datasets.
80 | For example, the below will create a `library-fastcore` and
81 | `library-timm` dataset. If they already exist, it will push a new
82 | version if there is a more recent version available.
83 |
84 | ``` python
85 | libs = ['fastcore','timm']
86 | create_libs_datasets(libs,lib_path,username)
87 | ```
88 |
89 | Processing fastcore as library-fastcore at /root/kaggle_datasets/library-fastcore
90 | -----Downloading or Creating Dataset
91 | -----Checking dataset version against pip
92 | -----Kaggle dataset already up to date 1.5.16 to 1.5.16
93 | Processing timm as library-timm at /root/kaggle_datasets/library-timm
94 | -----Downloading or Creating Dataset
95 | -----Checking dataset version against pip
96 | -----Kaggle dataset already up to date 0.6.7 to 0.6.7
97 | Complete
98 |
99 | This creates datasets in kaggle with the needed files.
100 |
101 | 
102 |
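In a no-internet inference notebook you can then attach the dataset and
install from it. A sketch, assuming the dataset is attached as
`library-fastcore`:

``` python
import subprocess

# Offline install from the attached dataset (the same pip command as in
# the usage tip above, run from Python).
subprocess.run(['pip', 'install', '-Uqq', 'fastcore', '--no-index',
                '--find-links', 'file:///kaggle/input/library-fastcore/'],
               check=True)
```
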
103 | #### requirements.txt
104 |
105 | We can also create a single dataset containing multiple libraries based
106 | on a `requirements.txt` file for the project. If any files differ, it
107 | will push a new version.
108 |
109 | ``` python
110 | create_requirements_dataset('test_files/requirements.txt',lib_path,'libraries-pawpularity', username)
111 | ```
112 |
113 | Processing libraries-pawpularity at /root/kaggle_datasets/libraries-pawpularity
114 | -----Downloading or Creating Dataset
115 | Data package template written to: /root/kaggle_datasets/libraries-pawpularity/dataset-metadata.json
116 | -----Checking dataset version against pip
117 | -----Updating libraries-pawpularity in Kaggle
118 | Complete
119 |
120 | This creates a dataset in kaggle with the needed files.
121 |
122 | 
123 |
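Both helpers compose the lower-level functions in `fastkaggle.core`.
Roughly, the requirements.txt flow looks like this (a sketch of the steps,
not the exact implementation; the version comment is illustrative):

``` python
from pathlib import Path
from fastkaggle.core import mk_dataset, get_pip_libraries, push_dataset

ds_path = Path('/root/kaggle_datasets/libraries-pawpularity')
mk_dataset(ds_path, 'libraries-pawpularity', force=True)   # metadata + create on kaggle
get_pip_libraries(ds_path, 'test_files/requirements.txt')  # download the wheel files
push_dataset(ds_path, 'Update libraries')                  # push a new dataset version
```
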
--------------------------------------------------------------------------------
/index.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#|hide\n",
10 | "from fastkaggle.core import *\n",
11 | "from pathlib import Path"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# fastkaggle\n",
19 | "\n",
20 | "> Kaggling for fast kagglers!"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Install"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "Either:\n",
35 | "\n",
36 | " pip install fastkaggle\n",
37 | "\n",
38 | "or:\n",
39 | "\n",
40 | " mamba install -c fastai fastkaggle\n",
41 | "\n",
42 | "(or replace `mamba` with `conda` if you don't mind it taking much longer to run...)"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "## How to use"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "### Competition"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "This little library is where I'll be putting snippets of stuff which are useful on Kaggle. Functionality includes the following:\n",
64 | "\n",
65 | "It defines `iskaggle` which is `True` if you're running on Kaggle:"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [
73 | {
74 | "data": {
75 | "text/plain": [
76 | "'Not Kaggle'"
77 | ]
78 | },
79 | "execution_count": null,
80 | "metadata": {},
81 | "output_type": "execute_result"
82 | }
83 | ],
84 | "source": [
85 | "'Kaggle' if iskaggle else 'Not Kaggle'"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 |     "It provides a `setup_comp` function which gets a path to the data for a competition, downloading it if needed, and also installs any modules that might be missing or out of date if running on Kaggle:"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/plain": [
103 | "Path('titanic')"
104 | ]
105 | },
106 | "execution_count": null,
107 | "metadata": {},
108 | "output_type": "execute_result"
109 | }
110 | ],
111 | "source": [
112 | "setup_comp('titanic')"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 |     "There's also `push_notebook` to push a notebook to Kaggle Notebooks, and `import_kaggle` to use the Kaggle API (even when you're on Kaggle!). See the `fastkaggle.core` docs for details."
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "### Datasets"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 |     "This section is designed to make uploading pip libraries to kaggle datasets easy. There are two primary high-level functions. First we define our Kaggle username and the local path we want to use to store datasets when we create them. \n",
134 | "\n",
135 | ":::{.callout-tip}\n",
136 | "## Usage tip\n",
137 | "The purpose of this is to create datasets that can be used in no internet inference competitions to install libraries using `pip install -Uqq library --no-index --find-links=file:///kaggle/input/your_dataset/`\n",
138 | ":::"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "lib_path = Path.home()/'kaggle_datasets'\n",
148 | "username = 'isaacflath'"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "#### List of Libraries\n",
156 | "\n",
157 |     "We can take a list of libraries and upload them as separate datasets. For example, the below will create `library-fastcore`, `library-flask`, and `library-fastkaggle` datasets. If they already exist, it will push a new version if there is a more recent version available."
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [
165 | {
166 | "name": "stdout",
167 | "output_type": "stream",
168 | "text": [
169 | "Processing fastcore as library-fastcore at /Users/isaacflath/kaggle_datasets/library-fastcore\n",
170 | "-----Downloading or Creating Dataset\n",
171 | "-----Checking dataset version against pip\n",
172 | "-----Kaggle dataset already up to date 1.5.16 to 1.5.16\n",
173 | "Processing flask as library-flask at /Users/isaacflath/kaggle_datasets/library-flask\n",
174 | "-----Downloading or Creating Dataset\n",
175 | "-----Checking dataset version against pip\n",
176 | "-----Kaggle dataset already up to date 2.2.2 to 2.2.2\n",
177 | "Processing fastkaggle as library-fastkaggle at /Users/isaacflath/kaggle_datasets/library-fastkaggle\n",
178 | "-----Downloading or Creating Dataset\n",
179 | "-----Checking dataset version against pip\n",
180 | "-----Kaggle dataset already up to date 0.0.6 to 0.0.6\n",
181 | "Complete\n"
182 | ]
183 | }
184 | ],
185 | "source": [
186 | "libs = ['fastcore','flask','fastkaggle']\n",
187 | "create_libs_datasets(libs,lib_path,username)"
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {},
193 | "source": [
194 |     "This creates datasets in kaggle with the needed files. For example, the library `fastkaggle` looks like this in kaggle.\n",
195 | "\n",
196 | ""
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "#### requirements.txt \n",
204 | "\n",
205 |     "We can also create a single dataset containing multiple libraries based on a `requirements.txt` file for the project. If any files differ, it will push a new version."
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "Processing libraries-pawpularity at /root/kaggle_datasets/libraries-pawpularity\n",
218 | "-----Downloading or Creating Dataset\n",
219 | "Data package template written to: /root/kaggle_datasets/libraries-pawpularity/dataset-metadata.json\n",
220 | "-----Checking dataset version against pip\n",
221 | "-----Updating libraries-pawpularity in Kaggle\n",
222 | "Complete\n"
223 | ]
224 | }
225 | ],
226 | "source": [
227 | "create_requirements_dataset('test_files/requirements.txt',lib_path,'libraries-pawpularity', username)"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "This creates a dataset in kaggle with the needed files.\n",
235 | "\n",
236 | ""
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": []
245 | }
246 | ],
247 | "metadata": {
248 | "kernelspec": {
249 | "display_name": "Python 3 (ipykernel)",
250 | "language": "python",
251 | "name": "python3"
252 | }
253 | },
254 | "nbformat": 4,
255 | "nbformat_minor": 4
256 | }
257 |
--------------------------------------------------------------------------------
/fastkaggle/core.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../00_core.ipynb.
2 |
3 | # %% auto 0
4 | __all__ = ['iskaggle', 'import_kaggle', 'setup_comp', 'nb_meta', 'push_notebook', 'check_ds_exists', 'mk_dataset', 'get_dataset',
5 | 'get_pip_library', 'get_pip_libraries', 'push_dataset', 'get_local_ds_ver', 'create_libs_datasets',
6 | 'create_requirements_dataset']
7 |
8 | # %% ../00_core.ipynb 3
9 | import os,json,subprocess, shutil
10 | import re
11 | from fastcore.utils import *
12 | # from fastcore.all import *
13 |
14 | # %% ../00_core.ipynb 4
15 | iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
16 |
17 | # %% ../00_core.ipynb 5
18 | def import_kaggle():
19 | "Import kaggle API, using Kaggle secrets `kaggle_username` and `kaggle_key` if needed"
20 | if iskaggle:
21 | from kaggle_secrets import UserSecretsClient
22 | sec = UserSecretsClient()
23 | os.environ['KAGGLE_USERNAME'] = sec.get_secret("kaggle_username")
24 | if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
25 | os.environ['KAGGLE_KEY'] = sec.get_secret("kaggle_key")
26 | from kaggle import api
27 | return api
28 |
29 | # %% ../00_core.ipynb 7
30 | def setup_comp(competition, install=''):
31 | "Get a path to data for `competition`, downloading it if needed"
32 | if iskaggle:
33 | if install:
34 | os.system(f'pip install -Uqq {install}')
35 | return Path('../input')/competition
36 | else:
37 | path = Path(competition)
38 | api = import_kaggle()
39 | if not path.exists():
40 | import zipfile
41 | api.competition_download_cli(str(competition))
42 | zipfile.ZipFile(f'{competition}.zip').extractall(str(competition))
43 | return path
44 |
45 | # %% ../00_core.ipynb 10
46 | def nb_meta(user, id, title, file, competition=None, private=True, gpu=False, internet=True, linked_datasets=None):
47 | "Get the `dict` required for a kernel-metadata.json file"
48 | d = {
49 | "id": f"{user}/{id}",
50 | "title": title,
51 | "code_file": file,
52 | "language": "python",
53 | "kernel_type": "notebook",
54 | "is_private": private,
55 | "enable_gpu": gpu,
56 | "enable_internet": internet,
57 | "keywords": [],
58 | "dataset_sources": linked_datasets if linked_datasets else [],
59 | "kernel_sources": []
60 | }
61 | if competition: d["competition_sources"] = [f"competitions/{competition}"]
62 | return d
63 |
64 | # %% ../00_core.ipynb 12
65 | def push_notebook(user, id, title, file, path='.', competition=None, private=True, gpu=False, internet=True, linked_datasets=None):
66 | "Push notebook `file` to Kaggle Notebooks"
67 | meta = nb_meta(user, id, title, file=file, competition=competition, private=private, gpu=gpu, internet=internet, linked_datasets=linked_datasets)
68 | path = Path(path)
69 | nm = 'kernel-metadata.json'
70 | path.mkdir(exist_ok=True, parents=True)
71 | with open(path/nm, 'w') as f: json.dump(meta, f, indent=2)
72 | api = import_kaggle()
73 | api.kernels_push_cli(str(path))
74 |
75 | # %% ../00_core.ipynb 16
76 | def check_ds_exists(dataset_slug # Dataset slug (ie "zillow/zecon")
77 | ):
78 | '''Checks if a dataset exists in kaggle and returns boolean'''
79 | api = import_kaggle()
80 | ds_search = L(api.dataset_list(mine=True)).filter(lambda x: str(x)==dataset_slug)
81 | if len(ds_search)==1: return True
82 | elif len(ds_search)==0: return False
83 |     else: raise Exception("Multiple datasets found - Check Manually")
84 |
85 | # %% ../00_core.ipynb 17
86 | def mk_dataset(dataset_path, # Local path to create dataset in
87 | title, # Name of the dataset
88 | force=False, # Should it overwrite or error if exists?
89 | upload=True # Should it upload and create on kaggle
90 | ):
91 | '''Creates minimal dataset metadata needed to push new dataset to kaggle'''
92 | dataset_path = Path(dataset_path)
93 | dataset_path.mkdir(exist_ok=force,parents=True)
94 | api = import_kaggle()
95 | api.dataset_initialize(dataset_path)
96 | md = json.load(open(dataset_path/'dataset-metadata.json'))
97 | md['title'] = title
98 | md['id'] = md['id'].replace('INSERT_SLUG_HERE',title)
99 | json.dump(md,open(dataset_path/'dataset-metadata.json','w'))
100 |     if upload: (dataset_path/'empty.txt').touch()
101 |     if upload: api.dataset_create_new(str(dataset_path),public=True,dir_mode='zip',quiet=True)
102 |
103 | # %% ../00_core.ipynb 19
104 | def get_dataset(dataset_path, # Local path to download dataset to
105 | dataset_slug, # Dataset slug (ie "zillow/zecon")
106 | unzip=True, # Should it unzip after downloading?
107 | force=False # Should it overwrite or error if dataset_path exists?
108 | ):
109 | '''Downloads an existing dataset and metadata from kaggle'''
110 | if not force: assert not Path(dataset_path).exists()
111 | api = import_kaggle()
112 | api.dataset_metadata(dataset_slug,str(dataset_path))
113 | api.dataset_download_files(dataset_slug,str(dataset_path))
114 | if unzip:
115 | zipped_file = Path(dataset_path)/f"{dataset_slug.split('/')[-1]}.zip"
116 | import zipfile
117 | with zipfile.ZipFile(zipped_file, 'r') as zip_ref:
118 | zip_ref.extractall(Path(dataset_path))
119 | zipped_file.unlink()
120 |
121 |
122 | # %% ../00_core.ipynb 20
123 | def get_pip_library(dataset_path, # Local path to download pip library to
124 | pip_library, # name of library for pip to install
125 | pip_cmd="pip" # pip base to use (ie "pip3" or "pip")
126 | ):
127 | '''Download the whl files for pip_library and store in dataset_path'''
128 | bashCommand = f"{pip_cmd} download {pip_library} -d {dataset_path}"
129 | process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
130 | output, error = process.communicate()
131 |
132 | # %% ../00_core.ipynb 21
133 | def get_pip_libraries(dataset_path, # Local path to download pip library to
134 | requirements_path, # path to requirements file
135 | pip_cmd="pip" # pip base to use (ie "pip3" or "pip")
136 | ):
137 | '''Download whl files for a requirements.txt file and store in dataset_path'''
138 | bashCommand = f"{pip_cmd} download -r {requirements_path} -d {dataset_path}"
139 | process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
140 | output, error = process.communicate()
141 |
142 | # %% ../00_core.ipynb 23
143 | def push_dataset(dataset_path, # Local path where dataset is stored
144 | version_comment # Comment associated with this dataset update
145 | ):
146 | '''Push dataset update to kaggle. Dataset path must contain dataset metadata file'''
147 | api = import_kaggle()
148 | api.dataset_create_version(str(dataset_path),version_comment,dir_mode='zip',quiet=True)
149 |
150 | # %% ../00_core.ipynb 24
151 | def get_local_ds_ver(lib_path, # Local path dataset is stored in
152 | lib # Name of library (ie "fastcore")
153 | ):
154 | '''checks a local copy of kaggle dataset for library version number'''
155 | wheel_lib_name = lib.replace('-','_')
156 | local_path = (lib_path/f"library-{lib}")
157 | lib_whl = local_path.ls().filter(lambda x: wheel_lib_name in x.name.lower())
158 | if 1==len(lib_whl):
159 | return re.search(f"(?<={wheel_lib_name}-)[\d+.]+\d",lib_whl[0].name.lower())[0]
160 |     elif 0<len(lib_whl): raise Exception(f"Multiple wheel files found for {lib} - Check Manually")
161 |     else: return None
--------------------------------------------------------------------------------
/00_core.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "#|default_exp core"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "markdown",
14 |    "metadata": {},
15 |    "source": [
16 |     "# core\n",
17 |     "\n",
18 |     "> API details for fastkaggle."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "#|hide\n",
28 | "from nbdev.showdoc import *"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "#|export\n",
38 | "import os,json,subprocess, shutil\n",
39 | "import re\n",
40 | "from fastcore.utils import *\n",
41 | "# from fastcore.all import *"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "#|export\n",
51 | "iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "#|export\n",
61 | "def import_kaggle():\n",
62 | " \"Import kaggle API, using Kaggle secrets `kaggle_username` and `kaggle_key` if needed\"\n",
63 | " if iskaggle:\n",
64 | " from kaggle_secrets import UserSecretsClient\n",
65 | " sec = UserSecretsClient()\n",
66 | " os.environ['KAGGLE_USERNAME'] = sec.get_secret(\"kaggle_username\")\n",
67 | " if not os.environ['KAGGLE_USERNAME']: raise Exception(\"Please insert your Kaggle username and key into Kaggle secrets\")\n",
68 | " os.environ['KAGGLE_KEY'] = sec.get_secret(\"kaggle_key\")\n",
69 | " from kaggle import api\n",
70 | " return api"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/plain": [
81 | "(#20) [contradictory-my-dear-watson,gan-getting-started,store-sales-time-series-forecasting,tpu-getting-started,digit-recognizer,titanic,house-prices-advanced-regression-techniques,connectx,nlp-getting-started,spaceship-titanic...]"
82 | ]
83 | },
84 | "execution_count": null,
85 | "metadata": {},
86 | "output_type": "execute_result"
87 | }
88 | ],
89 | "source": [
90 | "api = import_kaggle()\n",
91 | "L(api.competitions_list())"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "#|export\n",
101 | "def setup_comp(competition, install=''):\n",
102 | " \"Get a path to data for `competition`, downloading it if needed\"\n",
103 | " if iskaggle:\n",
104 | " if install:\n",
105 | " os.system(f'pip install -Uqq {install}')\n",
106 | " return Path('../input')/competition\n",
107 | " else:\n",
108 | " path = Path(competition)\n",
109 | " api = import_kaggle()\n",
110 | " if not path.exists():\n",
111 | " import zipfile\n",
112 | " api.competition_download_cli(str(competition))\n",
113 | " zipfile.ZipFile(f'{competition}.zip').extractall(str(competition))\n",
114 | " return path"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "Path('titanic')"
126 | ]
127 | },
128 | "execution_count": null,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "setup_comp('titanic')"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 |     "If you pass a list of space-separated modules to `install`, they'll be installed if running on Kaggle."
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "#|export\n",
151 | "def nb_meta(user, id, title, file, competition=None, private=True, gpu=False, internet=True, linked_datasets=None):\n",
152 | " \"Get the `dict` required for a kernel-metadata.json file\"\n",
153 | " d = {\n",
154 | " \"id\": f\"{user}/{id}\",\n",
155 | " \"title\": title,\n",
156 | " \"code_file\": file,\n",
157 | " \"language\": \"python\",\n",
158 | " \"kernel_type\": \"notebook\",\n",
159 | " \"is_private\": private,\n",
160 | " \"enable_gpu\": gpu,\n",
161 | " \"enable_internet\": internet,\n",
162 | " \"keywords\": [],\n",
163 | " \"dataset_sources\": linked_datasets if linked_datasets else [],\n",
164 | " \"kernel_sources\": []\n",
165 | " }\n",
166 | " if competition: d[\"competition_sources\"] = [f\"competitions/{competition}\"]\n",
167 | " return d"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {},
174 | "outputs": [
175 | {
176 | "data": {
177 | "text/plain": [
178 | "{'id': 'jhoward/my-notebook',\n",
179 | " 'title': 'My notebook',\n",
180 | " 'code_file': 'my-notebook.ipynb',\n",
181 | " 'language': 'python',\n",
182 | " 'kernel_type': 'notebook',\n",
183 | " 'is_private': True,\n",
184 | " 'enable_gpu': False,\n",
185 | " 'enable_internet': True,\n",
186 | " 'keywords': [],\n",
187 | " 'dataset_sources': [],\n",
188 | " 'kernel_sources': [],\n",
189 | " 'competition_sources': ['competitions/paddy-disease-classification']}"
190 | ]
191 | },
192 | "execution_count": null,
193 | "metadata": {},
194 | "output_type": "execute_result"
195 | }
196 | ],
197 | "source": [
198 | "nb_meta('jhoward', 'my-notebook', 'My notebook', 'my-notebook.ipynb', competition='paddy-disease-classification')"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "#|export\n",
208 | "def push_notebook(user, id, title, file, path='.', competition=None, private=True, gpu=False, internet=True, linked_datasets=None):\n",
209 | " \"Push notebook `file` to Kaggle Notebooks\"\n",
210 | " meta = nb_meta(user, id, title, file=file, competition=competition, private=private, gpu=gpu, internet=internet, linked_datasets=linked_datasets)\n",
211 | " path = Path(path)\n",
212 | " nm = 'kernel-metadata.json'\n",
213 | " path.mkdir(exist_ok=True, parents=True)\n",
214 | " with open(path/nm, 'w') as f: json.dump(meta, f, indent=2)\n",
215 | " api = import_kaggle()\n",
216 | " api.kernels_push_cli(str(path))"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 |     "Note that Kaggle recommends that the `id` match the *slug* for the title -- i.e. it should be the same as the title, but lowercase, no punctuation, and spaces replaced with dashes. E.g.:\n",
224 | "\n",
225 | "```python\n",
226 | "push_notebook('jhoward', 'first-steps-road-to-the-top-part-1',\n",
227 | " title='First Steps: Road to the Top, Part 1',\n",
228 | " file='first-steps-road-to-the-top-part-1.ipynb',\n",
229 | " competition='paddy-disease-classification',\n",
230 | " private=False, gpu=True)\n",
231 | "```"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "## Datasets"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "### Core"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "#| export\n",
255 | "def check_ds_exists(dataset_slug # Dataset slug (ie \"zillow/zecon\")\n",
256 | " ):\n",
257 | " '''Checks if a dataset exists in kaggle and returns boolean'''\n",
258 | " api = import_kaggle()\n",
259 | " ds_search = L(api.dataset_list(mine=True)).filter(lambda x: str(x)==dataset_slug)\n",
260 | " if len(ds_search)==1: return True\n",
261 | " elif len(ds_search)==0: return False\n",
262 |     "    else: raise Exception(\"Multiple datasets found - Check Manually\")"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "#| export\n",
272 | "def mk_dataset(dataset_path, # Local path to create dataset in\n",
273 | " title, # Name of the dataset\n",
274 | " force=False, # Should it overwrite or error if exists?\n",
275 | " upload=True # Should it upload and create on kaggle\n",
276 | " ):\n",
277 | " '''Creates minimal dataset metadata needed to push new dataset to kaggle'''\n",
278 | " dataset_path = Path(dataset_path)\n",
279 | " dataset_path.mkdir(exist_ok=force,parents=True)\n",
280 | " api = import_kaggle()\n",
281 | " api.dataset_initialize(dataset_path)\n",
282 | " md = json.load(open(dataset_path/'dataset-metadata.json'))\n",
283 | " md['title'] = title\n",
284 | " md['id'] = md['id'].replace('INSERT_SLUG_HERE',title)\n",
285 | " json.dump(md,open(dataset_path/'dataset-metadata.json','w'))\n",
286 |     "    if upload: (dataset_path/'empty.txt').touch()\n",
287 |     "    if upload: api.dataset_create_new(str(dataset_path),public=True,dir_mode='zip',quiet=True)"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {},
294 | "outputs": [
295 | {
296 | "name": "stdout",
297 | "output_type": "stream",
298 | "text": [
299 | "Data package template written to: testds/dataset-metadata.json\n"
300 | ]
301 | }
302 | ],
303 | "source": [
304 | "mk_dataset('./testds','mytestds',force=True)\n",
305 | "md = json.load(open('./testds/dataset-metadata.json'))\n",
306 | "assert md['title'] == 'mytestds'\n",
307 | "assert md['id'].endswith('/mytestds')"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "#| export\n",
317 | "def get_dataset(dataset_path, # Local path to download dataset to\n",
318 | " dataset_slug, # Dataset slug (ie \"zillow/zecon\")\n",
319 | " unzip=True, # Should it unzip after downloading?\n",
320 | " force=False # Should it overwrite or error if dataset_path exists?\n",
321 | " ):\n",
322 | " '''Downloads an existing dataset and metadata from kaggle'''\n",
323 | " if not force: assert not Path(dataset_path).exists()\n",
324 | " api = import_kaggle()\n",
325 | " api.dataset_metadata(dataset_slug,str(dataset_path))\n",
326 | " api.dataset_download_files(dataset_slug,str(dataset_path))\n",
327 | " if unzip:\n",
328 | " zipped_file = Path(dataset_path)/f\"{dataset_slug.split('/')[-1]}.zip\"\n",
329 | " import zipfile\n",
330 | " with zipfile.ZipFile(zipped_file, 'r') as zip_ref:\n",
331 | " zip_ref.extractall(Path(dataset_path))\n",
332 | " zipped_file.unlink()\n",
333 | " "
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "#| export\n",
343 | "def get_pip_library(dataset_path, # Local path to download pip library to\n",
344 | " pip_library, # name of library for pip to install\n",
345 | " pip_cmd=\"pip\" # pip base to use (ie \"pip3\" or \"pip\")\n",
346 | " ): \n",
347 | " '''Download the whl files for pip_library and store in dataset_path'''\n",
348 | " bashCommand = f\"{pip_cmd} download {pip_library} -d {dataset_path}\"\n",
349 | " process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)\n",
350 | " output, error = process.communicate()"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "#| export\n",
360 | "def get_pip_libraries(dataset_path, # Local path to download pip library to\n",
361 | " requirements_path, # path to requirements file\n",
362 | " pip_cmd=\"pip\" # pip base to use (ie \"pip3\" or \"pip\")\n",
363 | " ):\n",
364 | " '''Download whl files for a requirements.txt file and store in dataset_path'''\n",
365 | " bashCommand = f\"{pip_cmd} download -r {requirements_path} -d {dataset_path}\"\n",
366 | " process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)\n",
367 | " output, error = process.communicate()"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "dl_path = Path('./mylib')\n",
377 | "get_pip_library(dl_path,'fastkaggle')\n",
378 | "assert 1==len([o for o in dl_path.ls() if str(o).startswith(f\"{dl_path}/fastkaggle\")])"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "#| export\n",
388 | "def push_dataset(dataset_path, # Local path where dataset is stored \n",
389 | " version_comment # Comment associated with this dataset update\n",
390 | " ):\n",
391 | " '''Push dataset update to kaggle. Dataset path must contain dataset metadata file'''\n",
392 | " api = import_kaggle()\n",
393 | " api.dataset_create_version(str(dataset_path),version_comment,dir_mode='zip',quiet=True)"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "#| export\n",
403 | "def get_local_ds_ver(lib_path, # Local path dataset is stored in\n",
404 | " lib # Name of library (ie \"fastcore\")\n",
405 | " ):\n",
406 | " '''checks a local copy of kaggle dataset for library version number'''\n",
407 | " wheel_lib_name = lib.replace('-','_')\n",
408 | " local_path = (lib_path/f\"library-{lib}\")\n",
409 | " lib_whl = local_path.ls().filter(lambda x: wheel_lib_name in x.name.lower())\n",
410 | " if 1==len(lib_whl):\n",
411 | " return re.search(f\"(?<={wheel_lib_name}-)[\\d+.]+\\d\",lib_whl[0].name.lower())[0]\n",
412 | " elif 0