├── .nojekyll
├── test_files
│   └── requirements.txt
├── fastkaggle
│   ├── __init__.py
│   ├── _nbdev.py
│   ├── _modidx.py
│   └── core.py
├── images
│   ├── library-fastkaggle.png
│   └── libraries-pawpularity.png
├── MANIFEST.in
├── styles.css
├── install_quarto.sh
├── CHANGELOG.md
├── _quarto.yml
├── settings.ini
├── .github
│   └── workflows
│       └── deploy.yaml
├── Makefile
├── .gitignore
├── setup.py
├── README.md
├── index.ipynb
├── LICENSE
└── 00_core.ipynb

/.nojekyll:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test_files/requirements.txt:
--------------------------------------------------------------------------------
1 | fastcore
2 | timm
--------------------------------------------------------------------------------
/fastkaggle/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.8"
2 | from .core import *
3 | 
4 | 
--------------------------------------------------------------------------------
/images/library-fastkaggle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastai/fastkaggle/master/images/library-fastkaggle.png
--------------------------------------------------------------------------------
/images/libraries-pawpularity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastai/fastkaggle/master/images/libraries-pawpularity.png
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include settings.ini
2 | include LICENSE
3 | include CONTRIBUTING.md
4 | include README.md
5 | recursive-exclude * __pycache__
--------------------------------------------------------------------------------
/styles.css:
--------------------------------------------------------------------------------
1 | .cell-output pre {
2 |   margin-left: 0.8rem;
3 |   margin-top: 0;
4 |   background: none;
5 |   border-left: 2px solid lightsalmon;
6 |   border-top-left-radius: 0;
7 |   border-top-right-radius: 0;
8 | }
9 | 
10 | .cell-output .sourceCode {
11 |   background: none;
12 |   margin-top: 0;
13 | }
14 | 
15 | .cell > .sourceCode {
16 |   margin-bottom: 0;
17 | }
--------------------------------------------------------------------------------
/fastkaggle/_nbdev.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED BY NBDEV! DO NOT EDIT!
2 | 
3 | __all__ = ["index", "modules", "custom_doc_links", "git_url"]
4 | 
5 | index = {"iskaggle": "00_core.ipynb",
6 |          "setup_comp": "00_core.ipynb",
7 |          "nb_meta": "00_core.ipynb",
8 |          "push_notebook": "00_core.ipynb"}
9 | 
10 | modules = ["core.py"]
11 | 
12 | doc_url = "https://fastai.github.io/fastkaggle/"
13 | 
14 | git_url = "https://github.com/fastai/fastkaggle/tree/master/"
15 | 
16 | def custom_doc_links(name): return None
17 | 
--------------------------------------------------------------------------------
/install_quarto.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | install_linux() {
4 |     echo "...installing Quarto"
5 |     wget -nv https://www.quarto.org/download/latest/quarto-linux-amd64.deb
6 |     sudo dpkg -i *64.deb
7 |     rm *64.deb
8 | }
9 | 
10 | install_mac() {
11 |     echo "...downloading Quarto installer"
12 |     wget -nv https://www.quarto.org/download/latest/quarto-macos.pkg
13 |     echo "...opening installer for Quarto"
14 |     open quarto-macos.pkg
15 | }
16 | 
17 | case "$OSTYPE" in
18 |     linux*) install_linux ;;
19 |     darwin*) install_mac ;;
20 |     *) echo "make sure you install the latest version of quarto: https://quarto.org/docs/get-started/" ;;
21 | esac
22 | 
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Release notes
2 | 
3 | 
4 | 
5 | ## 0.0.7
6 | 
7 | ### New Features
8 | 
9 | - Datasets functionality + Docs ([#9](https://github.com/fastai/fastkaggle/pull/9)), thanks to [@Isaac-Flath](https://github.com/Isaac-Flath)
10 |   - Two high-level functions let you pass either a list of libraries or a requirements.txt file, to maintain and update your own libraries as Kaggle datasets.
11 | 
12 | 
13 | ## 0.0.6
14 | 
15 | ### Bugs Squashed
16 | 
17 | - fix comp should be competition in setup_comp ([#3](https://github.com/fastai/fastkaggle/pull/3)), thanks to [@n-e-w](https://github.com/n-e-w)
18 | 
19 | 
20 | ## 0.0.4
21 | 
22 | ### Bugs Squashed
23 | 
24 | - api not exported ([#1](https://github.com/fastai/fastkaggle/issues/1))
25 | 
26 | 
27 | ## 0.0.1
28 | 
29 | - init release
30 | 
31 | 
--------------------------------------------------------------------------------
/_quarto.yml:
--------------------------------------------------------------------------------
1 | ipynb-filters: [nbdev_filter]
2 | 
3 | project:
4 |   type: website
5 |   output-dir: docs
6 |   preview:
7 |     port: 3000
8 |     browser: false
9 | 
10 | format:
11 |   html:
12 |     theme: cosmo
13 |     css: styles.css
14 |     toc: true
15 |     toc-depth: 4
16 | 
17 | website:
18 |   title: "fastkaggle"
19 |   site-url: "https://fastai.github.io/fastkaggle/"
20 |   description: "Kaggling for fast kagglers!"
21 |   execute: 
22 |     enabled: false
23 |   twitter-card: true
24 |   open-graph: true
25 |   reader-mode: true
26 |   repo-branch: master
27 |   repo-url: "https://github.com/fastai/fastkaggle/tree/master/"
28 |   repo-actions: [issue]
29 |   navbar:
30 |     background: primary
31 |     search: true
32 |     right:
33 |       - icon: github
34 |         href: "https://github.com/fastai/fastkaggle/tree/master/"
35 |   sidebar:
36 |     style: "floating"
37 | 
38 | metadata-files: 
39 |   - sidebar.yml
40 |   - custom.yml
41 | 
--------------------------------------------------------------------------------
/settings.ini:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | host = github
3 | lib_name = fastkaggle
4 | description = Kaggling for fast kagglers!
5 | copyright = Jeremy Howard, 2022 onwards 6 | keywords = machine-learning kaggle fastai nbdev 7 | user = fastai 8 | author = Jeremy Howard 9 | author_email = info@fast.ai 10 | branch = master 11 | version = 0.0.8 12 | min_python = 3.7 13 | audience = Developers 14 | language = English 15 | requirements = fastcore>=1.4.5 kaggle 16 | custom_sidebar = False 17 | license = apache2 18 | status = 2 19 | nbs_path = . 20 | doc_path = docs 21 | recursive = False 22 | tst_flags = notest 23 | doc_host = https://fastai.github.io 24 | doc_baseurl = /fastkaggle/ 25 | git_url = https://github.com/fastai/fastkaggle/tree/master/ 26 | lib_path = fastkaggle 27 | title = fastkaggle 28 | black_formatting = False 29 | readme_nb = index.ipynb 30 | allowed_metadata_keys = 31 | allowed_cell_metadata_keys = 32 | jupyter_hooks = True 33 | clean_ids = True 34 | 35 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - main 8 | workflow_dispatch: 9 | 10 | jobs: 11 | deploy: 12 | name: Deploy to GitHub Pages 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: actions/setup-python@v3 17 | - name: Install Dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install nbdev 21 | make install 22 | - name: Build website 23 | env: 24 | KAGGLE_USERNAME: test 25 | KAGGLE_KEY: test 26 | run: make docs 27 | - name: Deploy to GitHub Pages 28 | uses: peaceiris/actions-gh-pages@v3 29 | with: 30 | github_token: ${{ secrets.GITHUB_TOKEN }} 31 | force_orphan: true 32 | publish_dir: ./docs 33 | # The following lines assign commit authorship to the official 34 | # GH-Actions bot for deploys to `gh-pages` branch: 35 | # https://github.com/actions/checkout/issues/13#issuecomment-724415212 36 | # The GH actions bot is used by default if you didn't specify the two fields. 37 | # You can swap them out with your own user credentials. 38 | user_name: github-actions[bot] 39 | user_email: 41898282+github-actions[bot]@users.noreply.github.com 40 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .ONESHELL: 2 | SHELL := /bin/bash 3 | 4 | exp: 5 | nbdev_clean 6 | nbdev_export 7 | 8 | help: ## Show this help 9 | @egrep -h '\s##\s' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' 10 | 11 | sync: ## Propagates any change in the modules (.py files) to the notebooks that created them 12 | nbdev_update 13 | 14 | deploy: docs ## Push local docs to gh-pages branch 15 | nbdev_ghp_deploy 16 | 17 | preview: ## Live preview quarto docs with hot reloading. 18 | nbdev_sidebar 19 | nbdev_export 20 | IN_TEST=1 && nbdev_quarto --preview 21 | 22 | docs: .FORCE ## Build quarto docs and put them into folder specified in `doc_path` in settings.ini 23 | nbdev_export 24 | nbdev_quarto 25 | 26 | prepare: ## Export notebooks to python modules, test code and clean notebooks. 27 | nbdev_export 28 | nbdev_test 29 | nbdev_clean 30 | 31 | test: ## Test notebooks 32 | nbdev_test 33 | 34 | release_all: pypi release_conda ## Release python package on pypi and conda. Also bumps version number automatically. 35 | nbdev_bump_version 36 | nbdev_export 37 | 38 | release_pypi: pypi ## Release python package on pypi. 
Also bumps version number automatically.
39 | 	nbdev_export
40 | 	nbdev_bump_version
41 | 
42 | release_conda:
43 | 	fastrelease_conda_package
44 | 
45 | pypi: dist
46 | 	twine upload --repository pypi dist/*
47 | 
48 | dist: clean
49 | 	python setup.py sdist bdist_wheel
50 | 
51 | clean:
52 | 	rm -rf dist
53 | 
54 | 
55 | install: install_quarto ## Install quarto and the latest version of the local python package as an editable install
56 | 	pip install -e ".[dev]"
57 | 
58 | install_py: .FORCE
59 | 	nbdev_export
60 | 	pip install -e ".[dev]"
61 | 
62 | install_quarto: .FORCE ## Install the latest version of quarto for Mac and Linux. Go to https://quarto.org/docs/get-started/ for Windows.
63 | 	./install_quarto.sh
64 | 
65 | .FORCE:
66 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | sidebar.yml
2 | conda/
3 | titanic*
4 | docs/
5 | 
6 | *.bak
7 | .gitattributes
8 | .last_checked
9 | .gitconfig
10 | *.bak
11 | *.log
12 | *~
13 | ~*
14 | _tmp*
15 | tmp*
16 | tags
17 | *.pkg
18 | 
19 | # Byte-compiled / optimized / DLL files
20 | __pycache__/
21 | *.py[cod]
22 | *$py.class
23 | 
24 | # C extensions
25 | *.so
26 | 
27 | # Distribution / packaging
28 | .Python
29 | env/
30 | build/
31 | develop-eggs/
32 | dist/
33 | downloads/
34 | eggs/
35 | .eggs/
36 | lib/
37 | lib64/
38 | parts/
39 | sdist/
40 | var/
41 | wheels/
42 | *.egg-info/
43 | .installed.cfg
44 | *.egg
45 | 
46 | # PyInstaller
47 | #  Usually these files are written by a python script from a template
48 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest
50 | *.spec
51 | 
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 | 
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .coverage
60 | .coverage.*
61 | .cache
62 | nosetests.xml
63 | coverage.xml
64 | *.cover
65 | .hypothesis/
66 | 
67 | # Translations
68 | *.mo
69 | *.pot
70 | 
71 | # Django stuff:
72 | *.log
73 | local_settings.py
74 | 
75 | # Flask stuff:
76 | instance/
77 | .webassets-cache
78 | 
79 | # Scrapy stuff:
80 | .scrapy
81 | 
82 | # Sphinx documentation
83 | docs/_build/
84 | 
85 | # PyBuilder
86 | target/
87 | 
88 | # Jupyter Notebook
89 | .ipynb_checkpoints
90 | 
91 | # pyenv
92 | .python-version
93 | 
94 | # celery beat schedule file
95 | celerybeat-schedule
96 | 
97 | # SageMath parsed files
98 | *.sage.py
99 | 
100 | # dotenv
101 | .env
102 | 
103 | # virtualenv
104 | .venv
105 | venv/
106 | ENV/
107 | 
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 | 
112 | # Rope project settings
113 | .ropeproject
114 | 
115 | # mkdocs documentation
116 | /site
117 | 
118 | # mypy
119 | .mypy_cache/
120 | 
121 | .vscode
122 | *.swp
123 | 
124 | # osx generated files
125 | .DS_Store
126 | .DS_Store?
127 | .Trashes 128 | ehthumbs.db 129 | Thumbs.db 130 | .idea 131 | 132 | # pytest 133 | .pytest_cache 134 | 135 | # tools/trust-doc-nbs 136 | docs_src/.last_checked 137 | 138 | # symlinks to fastai 139 | docs_src/fastai 140 | tools/fastai 141 | 142 | # link checker 143 | checklink/cookies.txt 144 | 145 | # .gitconfig is now autogenerated 146 | .gitconfig 147 | 148 | 149 | /.quarto/ 150 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import parse_version 2 | from configparser import ConfigParser 3 | import setuptools 4 | assert parse_version(setuptools.__version__)>=parse_version('36.2') 5 | 6 | # note: all settings are in settings.ini; edit there, not here 7 | config = ConfigParser(delimiters=['=']) 8 | config.read('settings.ini') 9 | cfg = config['DEFAULT'] 10 | 11 | cfg_keys = 'version description keywords author author_email'.split() 12 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split() 13 | for o in expected: assert o in cfg, "missing expected setting: {}".format(o) 14 | setup_cfg = {o:cfg[o] for o in cfg_keys} 15 | 16 | licenses = { 17 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'), 18 | 'mit': ('MIT License', 'OSI Approved :: MIT License'), 19 | 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'), 20 | 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'), 21 | 'bsd3': ('BSD License', 'OSI Approved :: BSD License'), 22 | } 23 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', 24 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ] 25 | py_versions = '2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8'.split() 26 | 27 | requirements = cfg.get('requirements','').split() 28 | min_python = cfg['min_python'] 29 | lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) 30 | dev_requirements = (cfg.get('dev_requirements') or '').split() 31 | 32 | setuptools.setup( 33 | name = cfg['lib_name'], 34 | license = lic[0], 35 | classifiers = [ 36 | 'Development Status :: ' + statuses[int(cfg['status'])], 37 | 'Intended Audience :: ' + cfg['audience'].title(), 38 | 'Natural Language :: ' + cfg['language'].title(), 39 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []), 40 | url = cfg['git_url'], 41 | packages = setuptools.find_packages(), 42 | include_package_data = True, 43 | install_requires = requirements, 44 | extras_require={ 'dev': dev_requirements }, 45 | dependency_links = cfg.get('dep_links','').split(), 46 | python_requires = '>=' + cfg['min_python'], 47 | long_description = open('README.md').read(), 48 | long_description_content_type = 'text/markdown', 49 | zip_safe = False, 50 | entry_points = { 51 | 'console_scripts': cfg.get('console_scripts','').split(), 52 | 'mkdocs.plugins': [ 'rm_num_prefix = nbdev.mkdocs:RmNumPrefix' ], 53 | 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'] 54 | }, 55 | **setup_cfg) 56 | 57 | 58 | -------------------------------------------------------------------------------- /fastkaggle/_modidx.py: -------------------------------------------------------------------------------- 1 | # Autogenerated by nbdev 2 | 3 | d = { 'settings': { 'allowed_cell_metadata_keys': '', 4 | 
'allowed_metadata_keys': '', 5 | 'audience': 'Developers', 6 | 'author': 'Jeremy Howard', 7 | 'author_email': 'info@fast.ai', 8 | 'black_formatting': 'False', 9 | 'branch': 'master', 10 | 'clean_ids': 'True', 11 | 'copyright': 'Jeremy Howard, 2022 onwards', 12 | 'custom_sidebar': 'False', 13 | 'description': 'Kaggling for fast kagglers!', 14 | 'doc_baseurl': '/fastkaggle/', 15 | 'doc_host': 'https://fastai.github.io', 16 | 'doc_path': 'docs', 17 | 'git_url': 'https://github.com/fastai/fastkaggle/tree/master/', 18 | 'host': 'github', 19 | 'jupyter_hooks': 'True', 20 | 'keywords': 'machine-learning kaggle fastai nbdev', 21 | 'language': 'English', 22 | 'lib_name': 'fastkaggle', 23 | 'lib_path': 'fastkaggle', 24 | 'license': 'apache2', 25 | 'min_python': '3.7', 26 | 'nbs_path': '.', 27 | 'readme_nb': 'index.ipynb', 28 | 'recursive': 'False', 29 | 'requirements': 'fastcore>=1.4.5 kaggle', 30 | 'status': '2', 31 | 'title': 'fastkaggle', 32 | 'tst_flags': 'notest', 33 | 'user': 'fastai', 34 | 'version': '0.0.8'}, 35 | 'syms': { 'fastkaggle.core': { 'fastkaggle.core.check_ds_exists': 'https://fastai.github.io/fastkaggle/core.html#check_ds_exists', 36 | 'fastkaggle.core.create_libs_datasets': 'https://fastai.github.io/fastkaggle/core.html#create_libs_datasets', 37 | 'fastkaggle.core.create_requirements_dataset': 'https://fastai.github.io/fastkaggle/core.html#create_requirements_dataset', 38 | 'fastkaggle.core.get_dataset': 'https://fastai.github.io/fastkaggle/core.html#get_dataset', 39 | 'fastkaggle.core.get_local_ds_ver': 'https://fastai.github.io/fastkaggle/core.html#get_local_ds_ver', 40 | 'fastkaggle.core.get_pip_libraries': 'https://fastai.github.io/fastkaggle/core.html#get_pip_libraries', 41 | 'fastkaggle.core.get_pip_library': 'https://fastai.github.io/fastkaggle/core.html#get_pip_library', 42 | 'fastkaggle.core.import_kaggle': 'https://fastai.github.io/fastkaggle/core.html#import_kaggle', 43 | 'fastkaggle.core.iskaggle': 'https://fastai.github.io/fastkaggle/core.html#iskaggle', 44 | 'fastkaggle.core.mk_dataset': 'https://fastai.github.io/fastkaggle/core.html#mk_dataset', 45 | 'fastkaggle.core.nb_meta': 'https://fastai.github.io/fastkaggle/core.html#nb_meta', 46 | 'fastkaggle.core.push_dataset': 'https://fastai.github.io/fastkaggle/core.html#push_dataset', 47 | 'fastkaggle.core.push_notebook': 'https://fastai.github.io/fastkaggle/core.html#push_notebook', 48 | 'fastkaggle.core.setup_comp': 'https://fastai.github.io/fastkaggle/core.html#setup_comp'}}} -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | fastkaggle 2 | ================ 3 | 4 | 5 | 6 | ## Install 7 | 8 | Either: 9 | 10 | pip install fastkaggle 11 | 12 | or: 13 | 14 | mamba install -c fastai fastkaggle 15 | 16 | (or replace `mamba` with `conda` if you don’t mind it taking much longer 17 | to run…) 18 | 19 | ## How to use 20 | 21 | ### Competition 22 | 23 | This little library is where I’ll be putting snippets of stuff which are 24 | useful on Kaggle. 
Functionality includes the following:
25 | 
26 | It defines
27 | [`iskaggle`](https://fastai.github.io/fastkaggle/core.html#iskaggle)
28 | which is `True` if you’re running on Kaggle:
29 | 
30 | ``` python
31 | 'Kaggle' if iskaggle else 'Not Kaggle'
32 | ```
33 | 
34 |     'Not Kaggle'
35 | 
36 | It provides a
37 | [`setup_comp`](https://fastai.github.io/fastkaggle/core.html#setup_comp)
38 | function which gets a path to the data for a competition, downloading it
39 | if needed, and also installs any modules that might be missing or out of
40 | date if running on Kaggle:
41 | 
42 | ``` python
43 | setup_comp('titanic')
44 | ```
45 | 
46 |     Path('titanic')
47 | 
48 | There’s also
49 | [`push_notebook`](https://fastai.github.io/fastkaggle/core.html#push_notebook)
50 | to push a notebook to Kaggle Notebooks, and
51 | [`import_kaggle`](https://fastai.github.io/fastkaggle/core.html#import_kaggle)
52 | to use the Kaggle API (even when you’re on Kaggle!). See the
53 | `fastkaggle.core` docs for details.
54 | 
55 | ### Datasets
56 | 
57 | This section is designed to make uploading pip libraries to Kaggle
58 | datasets easy. There are two primary high-level functions to be used. First
59 | we can define our Kaggle username and the local path we want to use to
60 | store datasets when we create them.
61 | 
62 | 
63 | 
64 | > **Usage tip**
65 | >
66 | > The purpose of this is to create datasets that can be used in
67 | > no-internet inference competitions to install libraries using
68 | > `pip install -Uqq library --no-index --find-links=file:///kaggle/input/your_dataset/`
69 | 
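For example, once the `library-fastcore` dataset created below is attached
to a no-internet inference notebook, a cell like this installs the library
from the dataset’s wheel files rather than from PyPI (a minimal sketch --
adjust the dataset and library names to your own):

``` python
# Run inside a Kaggle notebook cell; `library-fastcore` is the dataset
# created by `create_libs_datasets` below.
!pip install -Uqq fastcore --no-index --find-links=file:///kaggle/input/library-fastcore/
```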
71 | 72 | ``` python 73 | lib_path = Path('/root/kaggle_datasets') 74 | username = 'isaacflath' 75 | ``` 76 | 77 | #### List of Libraries 78 | 79 | We can take a list of libraries and upload them as seperate datasets. 80 | For example the below will create a `library-fastcore` and 81 | `library-timm` dataset. If they already exist, it will push a new 82 | version if there is a more recent version available. 83 | 84 | ``` python 85 | libs = ['fastcore','timm'] 86 | create_libs_datasets(libs,lib_path,username) 87 | ``` 88 | 89 | Processing fastcore as library-fastcore at /root/kaggle_datasets/library-fastcore 90 | -----Downloading or Creating Dataset 91 | -----Checking dataset version against pip 92 | -----Kaggle dataset already up to date 1.5.16 to 1.5.16 93 | Processing timm as library-timm at /root/kaggle_datasets/library-timm 94 | -----Downloading or Creating Dataset 95 | -----Checking dataset version against pip 96 | -----Kaggle dataset already up to date 0.6.7 to 0.6.7 97 | Complete 98 | 99 | This creates datasets in kaggle with the needed files. 100 | 101 | ![Pawpularity Dataset](images/libraries-pawpularity.png) 102 | 103 | #### requirements.txt 104 | 105 | We can also create a singular dataset with multiple libraries based on a 106 | `requirements.txt` file for the project. If there are any different 107 | files it will push a new version. 108 | 109 | ``` python 110 | create_requirements_dataset('test_files/requirements.txt',lib_path,'libraries-pawpularity', username) 111 | ``` 112 | 113 | Processing libraries-pawpularity at /root/kaggle_datasets/libraries-pawpularity 114 | -----Downloading or Creating Dataset 115 | Data package template written to: /root/kaggle_datasets/libraries-pawpularity/dataset-metadata.json 116 | -----Checking dataset version against pip 117 | -----Updating libraries-pawpularity in Kaggle 118 | Complete 119 | 120 | This creats a dataset in kaggle with the needed files. 121 | 122 | ![Fastkaggle Dataset](images/library-fastkaggle.png) 123 | -------------------------------------------------------------------------------- /index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#|hide\n", 10 | "from fastkaggle.core import *\n", 11 | "from pathlib import Path" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# fastkaggle\n", 19 | "\n", 20 | "> Kaggling for fast kagglers!" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Install" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "Either:\n", 35 | "\n", 36 | " pip install fastkaggle\n", 37 | "\n", 38 | "or:\n", 39 | "\n", 40 | " mamba install -c fastai fastkaggle\n", 41 | "\n", 42 | "(or replace `mamba` with `conda` if you don't mind it taking much longer to run...)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## How to use" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Competition" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "This little library is where I'll be putting snippets of stuff which are useful on Kaggle. 
Functionality includes the following:\n",
64 |     "\n",
65 |     "It defines `iskaggle` which is `True` if you're running on Kaggle:"
66 |    ]
67 |   },
68 |   {
69 |    "cell_type": "code",
70 |    "execution_count": null,
71 |    "metadata": {},
72 |    "outputs": [
73 |     {
74 |      "data": {
75 |       "text/plain": [
76 |        "'Not Kaggle'"
77 |       ]
78 |      },
79 |      "execution_count": null,
80 |      "metadata": {},
81 |      "output_type": "execute_result"
82 |     }
83 |    ],
84 |    "source": [
85 |     "'Kaggle' if iskaggle else 'Not Kaggle'"
86 |    ]
87 |   },
88 |   {
89 |    "cell_type": "markdown",
90 |    "metadata": {},
91 |    "source": [
92 |     "It provides a `setup_comp` function which gets a path to the data for a competition, downloading it if needed, and also installs any modules that might be missing or out of date if running on Kaggle:"
93 |    ]
94 |   },
95 |   {
96 |    "cell_type": "code",
97 |    "execution_count": null,
98 |    "metadata": {},
99 |    "outputs": [
100 |     {
101 |      "data": {
102 |       "text/plain": [
103 |        "Path('titanic')"
104 |       ]
105 |      },
106 |      "execution_count": null,
107 |      "metadata": {},
108 |      "output_type": "execute_result"
109 |     }
110 |    ],
111 |    "source": [
112 |     "setup_comp('titanic')"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "markdown",
117 |    "metadata": {},
118 |    "source": [
119 |     "There's also `push_notebook` to push a notebook to Kaggle Notebooks, and `import_kaggle` to use the Kaggle API (even when you're on Kaggle!). See the `fastkaggle.core` docs for details."
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "metadata": {},
125 |    "source": [
126 |     "### Datasets"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     "This section is designed to make uploading pip libraries to Kaggle datasets easy. There are two primary high-level functions to be used. First we can define our Kaggle username and the local path we want to use to store datasets when we create them. \n",
134 |     "\n",
135 |     ":::{.callout-tip}\n",
136 |     "## Usage tip\n",
137 |     "The purpose of this is to create datasets that can be used in no-internet inference competitions to install libraries using `pip install -Uqq library --no-index --find-links=file:///kaggle/input/your_dataset/`\n",
138 |     ":::"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "lib_path = Path.home()/'kaggle_datasets'\n",
148 |     "username = 'isaacflath'"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "markdown",
153 |    "metadata": {},
154 |    "source": [
155 |     "#### List of Libraries\n",
156 |     "\n",
157 |     "We can take a list of libraries and upload them as separate datasets. For example, the code below will create `library-fastcore`, `library-flask`, and `library-fastkaggle` datasets. If they already exist, it will push a new version if there is a more recent version available."
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [
165 |     {
166 |      "name": "stdout",
167 |      "output_type": "stream",
168 |      "text": [
169 |       "Processing fastcore as library-fastcore at /Users/isaacflath/kaggle_datasets/library-fastcore\n",
170 |       "-----Downloading or Creating Dataset\n",
171 |       "-----Checking dataset version against pip\n",
172 |       "-----Kaggle dataset already up to date 1.5.16 to 1.5.16\n",
173 |       "Processing flask as library-flask at /Users/isaacflath/kaggle_datasets/library-flask\n",
174 |       "-----Downloading or Creating Dataset\n",
175 |       "-----Checking dataset version against pip\n",
176 |       "-----Kaggle dataset already up to date 2.2.2 to 2.2.2\n",
177 |       "Processing fastkaggle as library-fastkaggle at /Users/isaacflath/kaggle_datasets/library-fastkaggle\n",
178 |       "-----Downloading or Creating Dataset\n",
179 |       "-----Checking dataset version against pip\n",
180 |       "-----Kaggle dataset already up to date 0.0.6 to 0.0.6\n",
181 |       "Complete\n"
182 |      ]
183 |     }
184 |    ],
185 |    "source": [
186 |     "libs = ['fastcore','flask','fastkaggle']\n",
187 |     "create_libs_datasets(libs,lib_path,username)"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "markdown",
192 |    "metadata": {},
193 |    "source": [
194 |     "This creates datasets in Kaggle with the needed files. For example, the library `fastkaggle` looks like this in Kaggle.\n",
195 |     "\n",
196 |     "![Fastkaggle Dataset](images/library-fastkaggle.png)"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {},
202 |    "source": [
203 |     "#### requirements.txt \n",
204 |     "\n",
205 |     "We can also create a single dataset with multiple libraries based on a `requirements.txt` file for the project. If any files differ, it will push a new version."
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {},
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "Processing libraries-pawpularity at /root/kaggle_datasets/libraries-pawpularity\n",
218 |       "-----Downloading or Creating Dataset\n",
219 |       "Data package template written to: /root/kaggle_datasets/libraries-pawpularity/dataset-metadata.json\n",
220 |       "-----Checking dataset version against pip\n",
221 |       "-----Updating libraries-pawpularity in Kaggle\n",
222 |       "Complete\n"
223 |      ]
224 |     }
225 |    ],
226 |    "source": [
227 |     "create_requirements_dataset('test_files/requirements.txt',lib_path,'libraries-pawpularity', username)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "markdown",
232 |    "metadata": {},
233 |    "source": [
234 |     "This creates a dataset in Kaggle with the needed files.\n",
235 |     "\n",
236 |     "![Pawpularity Dataset](images/libraries-pawpularity.png)"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": null,
242 |    "metadata": {},
243 |    "outputs": [],
244 |    "source": []
245 |   }
246 |  ],
247 |  "metadata": {
248 |   "kernelspec": {
249 |    "display_name": "Python 3 (ipykernel)",
250 |    "language": "python",
251 |    "name": "python3"
252 |   }
253 |  },
254 |  "nbformat": 4,
255 |  "nbformat_minor": 4
256 | }
257 | 
--------------------------------------------------------------------------------
/fastkaggle/core.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../00_core.ipynb.
2 | 
3 | # %% auto 0
4 | __all__ = ['iskaggle', 'import_kaggle', 'setup_comp', 'nb_meta', 'push_notebook', 'check_ds_exists', 'mk_dataset', 'get_dataset',
5 |            'get_pip_library', 'get_pip_libraries', 'push_dataset', 'get_local_ds_ver', 'create_libs_datasets',
6 |            'create_requirements_dataset']
7 | 
8 | # %% ../00_core.ipynb 3
9 | import os,json,subprocess, shutil
10 | import re
11 | from fastcore.utils import *
12 | # from fastcore.all import *
13 | 
14 | # %% ../00_core.ipynb 4
15 | iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
16 | 
17 | # %% ../00_core.ipynb 5
18 | def import_kaggle():
19 |     "Import kaggle API, using Kaggle secrets `kaggle_username` and `kaggle_key` if needed"
20 |     if iskaggle:
21 |         from kaggle_secrets import UserSecretsClient
22 |         sec = UserSecretsClient()
23 |         os.environ['KAGGLE_USERNAME'] = sec.get_secret("kaggle_username")
24 |         if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
25 |         os.environ['KAGGLE_KEY'] = sec.get_secret("kaggle_key")
26 |     from kaggle import api
27 |     return api
28 | 
29 | # %% ../00_core.ipynb 7
30 | def setup_comp(competition, install=''):
31 |     "Get a path to data for `competition`, downloading it if needed"
32 |     if iskaggle:
33 |         if install:
34 |             os.system(f'pip install -Uqq {install}')
35 |         return Path('../input')/competition
36 |     else:
37 |         path = Path(competition)
38 |         api = import_kaggle()
39 |         if not path.exists():
40 |             import zipfile
41 |             api.competition_download_cli(str(competition))
42 |             zipfile.ZipFile(f'{competition}.zip').extractall(str(competition))
43 |         return path
44 | 
45 | # %% ../00_core.ipynb 10
46 | def nb_meta(user, id, title, file, competition=None, private=True, gpu=False, internet=True, linked_datasets=None):
47 |     "Get the `dict` required for a kernel-metadata.json file"
48 |     d = {
49 |       "id": f"{user}/{id}",
50 |       "title": title,
51 |       "code_file": file,
52 |       "language": "python",
53 |       "kernel_type": "notebook",
54 |       "is_private": private,
55 |       "enable_gpu": gpu,
56 |       "enable_internet": internet,
57 |       "keywords": [],
58 |       "dataset_sources": linked_datasets if linked_datasets else [],
59 |       "kernel_sources": []
60 |     }
61 |     if competition: d["competition_sources"] = [f"competitions/{competition}"]
62 |     return d
63 | 
64 | # %% ../00_core.ipynb 12
65 | def push_notebook(user, id, title, file, path='.', competition=None, private=True, gpu=False, internet=True, linked_datasets=None):
66 |     "Push notebook `file` to Kaggle Notebooks"
67 |     meta = nb_meta(user, id, title, file=file, competition=competition, private=private, gpu=gpu, internet=internet, linked_datasets=linked_datasets)
68 |     path = Path(path)
69 |     nm = 'kernel-metadata.json'
70 |     path.mkdir(exist_ok=True, parents=True)
71 |     with open(path/nm, 'w') as f: json.dump(meta, f, indent=2)
72 |     api = import_kaggle()
73 |     api.kernels_push_cli(str(path))
74 | 
75 | # %% ../00_core.ipynb 16
76 | def check_ds_exists(dataset_slug # Dataset slug (ie "zillow/zecon")
77 |                    ):
78 |     '''Checks if a dataset exists in kaggle and returns boolean'''
79 |     api = import_kaggle()
80 |     ds_search = L(api.dataset_list(mine=True)).filter(lambda x: str(x)==dataset_slug)
81 |     if len(ds_search)==1: return True
82 |     elif len(ds_search)==0: return False
83 |     else: raise Exception("Multiple datasets found - Check Manually")
84 | 
85 | # %% ../00_core.ipynb 17
86 | def mk_dataset(dataset_path, # Local path to create dataset in
87 |                title, # Name of the dataset
88 |                force=False, # Should it overwrite or error if exists?
89 |                upload=True # Should it upload and create on kaggle
90 |               ):
91 |     '''Creates minimal dataset metadata needed to push new dataset to kaggle'''
92 |     dataset_path = Path(dataset_path)
93 |     dataset_path.mkdir(exist_ok=force,parents=True)
94 |     api = import_kaggle()
95 |     api.dataset_initialize(dataset_path)
96 |     md = json.load(open(dataset_path/'dataset-metadata.json'))
97 |     md['title'] = title
98 |     md['id'] = md['id'].replace('INSERT_SLUG_HERE',title)
99 |     json.dump(md,open(dataset_path/'dataset-metadata.json','w'))
100 |     if upload: (dataset_path/'empty.txt').touch()
101 |     if upload: api.dataset_create_new(str(dataset_path),public=True,dir_mode='zip',quiet=True)
102 | 
103 | # %% ../00_core.ipynb 19
104 | def get_dataset(dataset_path, # Local path to download dataset to
105 |                 dataset_slug, # Dataset slug (ie "zillow/zecon")
106 |                 unzip=True, # Should it unzip after downloading?
107 |                 force=False # Should it overwrite or error if dataset_path exists?
108 |                ):
109 |     '''Downloads an existing dataset and metadata from kaggle'''
110 |     if not force: assert not Path(dataset_path).exists()
111 |     api = import_kaggle()
112 |     api.dataset_metadata(dataset_slug,str(dataset_path))
113 |     api.dataset_download_files(dataset_slug,str(dataset_path))
114 |     if unzip:
115 |         zipped_file = Path(dataset_path)/f"{dataset_slug.split('/')[-1]}.zip"
116 |         import zipfile
117 |         with zipfile.ZipFile(zipped_file, 'r') as zip_ref:
118 |             zip_ref.extractall(Path(dataset_path))
119 |         zipped_file.unlink()
120 | 
121 | 
122 | # %% ../00_core.ipynb 20
123 | def get_pip_library(dataset_path, # Local path to download pip library to
124 |                     pip_library, # name of library for pip to install
125 |                     pip_cmd="pip" # pip base to use (ie "pip3" or "pip")
126 |                    ):
127 |     '''Download the whl files for pip_library and store in dataset_path'''
128 |     bashCommand = f"{pip_cmd} download {pip_library} -d {dataset_path}"
129 |     process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
130 |     output, error = process.communicate()
131 | 
132 | # %% ../00_core.ipynb 21
133 | def get_pip_libraries(dataset_path, # Local path to download pip library to
134 |                       requirements_path, # path to requirements file
135 |                       pip_cmd="pip" # pip base to use (ie "pip3" or "pip")
136 |                      ):
137 |     '''Download whl files for a requirements.txt file and store in dataset_path'''
138 |     bashCommand = f"{pip_cmd} download -r {requirements_path} -d {dataset_path}"
139 |     process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
140 |     output, error = process.communicate()
141 | 
142 | # %% ../00_core.ipynb 23
143 | def push_dataset(dataset_path, # Local path where dataset is stored 
144 |                  version_comment # Comment associated with this dataset update
145 |                 ):
146 |     '''Push dataset update to kaggle. Dataset path must contain dataset metadata file'''
147 |     api = import_kaggle()
148 |     api.dataset_create_version(str(dataset_path),version_comment,dir_mode='zip',quiet=True)
149 | 
150 | # %% ../00_core.ipynb 24
151 | def get_local_ds_ver(lib_path, # Local path dataset is stored in
152 |                      lib # Name of library (ie "fastcore")
153 |                     ):
154 |     '''checks a local copy of kaggle dataset for library version number'''
155 |     wheel_lib_name = lib.replace('-','_')
156 |     local_path = (lib_path/f"library-{lib}")
157 |     lib_whl = local_path.ls().filter(lambda x: wheel_lib_name in x.name.lower())
158 |     if 1==len(lib_whl):
159 |         return re.search(f"(?<={wheel_lib_name}-)[\d+.]+\d",lib_whl[0].name.lower())[0]
160 |     elif 0 API details for fastkaggle."
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "#|hide\n", 28 | "from nbdev.showdoc import *" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "#|export\n", 38 | "import os,json,subprocess, shutil\n", 39 | "import re\n", 40 | "from fastcore.utils import *\n", 41 | "# from fastcore.all import *" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "#|export\n", 51 | "iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#|export\n", 61 | "def import_kaggle():\n", 62 | " \"Import kaggle API, using Kaggle secrets `kaggle_username` and `kaggle_key` if needed\"\n", 63 | " if iskaggle:\n", 64 | " from kaggle_secrets import UserSecretsClient\n", 65 | " sec = UserSecretsClient()\n", 66 | " os.environ['KAGGLE_USERNAME'] = sec.get_secret(\"kaggle_username\")\n", 67 | " if not os.environ['KAGGLE_USERNAME']: raise Exception(\"Please insert your Kaggle username and key into Kaggle secrets\")\n", 68 | " os.environ['KAGGLE_KEY'] = sec.get_secret(\"kaggle_key\")\n", 69 | " from kaggle import api\n", 70 | " return api" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "(#20) [contradictory-my-dear-watson,gan-getting-started,store-sales-time-series-forecasting,tpu-getting-started,digit-recognizer,titanic,house-prices-advanced-regression-techniques,connectx,nlp-getting-started,spaceship-titanic...]" 82 | ] 83 | }, 84 | "execution_count": null, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "api = import_kaggle()\n", 91 | "L(api.competitions_list())" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "#|export\n", 101 | "def setup_comp(competition, install=''):\n", 102 | " \"Get a path to data for `competition`, downloading it if needed\"\n", 103 | " if iskaggle:\n", 104 | " if install:\n", 105 | " os.system(f'pip install -Uqq {install}')\n", 106 | " return Path('../input')/competition\n", 107 | " else:\n", 108 | " path = Path(competition)\n", 109 | " api = import_kaggle()\n", 110 | " if not path.exists():\n", 111 | " import zipfile\n", 112 | " api.competition_download_cli(str(competition))\n", 113 | " zipfile.ZipFile(f'{competition}.zip').extractall(str(competition))\n", 114 | " return path" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "Path('titanic')" 126 | ] 127 | }, 128 | "execution_count": null, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "setup_comp('titanic')" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "If you pass a list of space separated modules to `install`, they'll be installed if running on Kaggle." 
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "#|export\n",
151 |     "def nb_meta(user, id, title, file, competition=None, private=True, gpu=False, internet=True, linked_datasets=None):\n",
152 |     "    \"Get the `dict` required for a kernel-metadata.json file\"\n",
153 |     "    d = {\n",
154 |     "      \"id\": f\"{user}/{id}\",\n",
155 |     "      \"title\": title,\n",
156 |     "      \"code_file\": file,\n",
157 |     "      \"language\": \"python\",\n",
158 |     "      \"kernel_type\": \"notebook\",\n",
159 |     "      \"is_private\": private,\n",
160 |     "      \"enable_gpu\": gpu,\n",
161 |     "      \"enable_internet\": internet,\n",
162 |     "      \"keywords\": [],\n",
163 |     "      \"dataset_sources\": linked_datasets if linked_datasets else [],\n",
164 |     "      \"kernel_sources\": []\n",
165 |     "    }\n",
166 |     "    if competition: d[\"competition_sources\"] = [f\"competitions/{competition}\"]\n",
167 |     "    return d"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "metadata": {},
174 |    "outputs": [
175 |     {
176 |      "data": {
177 |       "text/plain": [
178 |        "{'id': 'jhoward/my-notebook',\n",
179 |        " 'title': 'My notebook',\n",
180 |        " 'code_file': 'my-notebook.ipynb',\n",
181 |        " 'language': 'python',\n",
182 |        " 'kernel_type': 'notebook',\n",
183 |        " 'is_private': True,\n",
184 |        " 'enable_gpu': False,\n",
185 |        " 'enable_internet': True,\n",
186 |        " 'keywords': [],\n",
187 |        " 'dataset_sources': [],\n",
188 |        " 'kernel_sources': [],\n",
189 |        " 'competition_sources': ['competitions/paddy-disease-classification']}"
190 |       ]
191 |      },
192 |      "execution_count": null,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "nb_meta('jhoward', 'my-notebook', 'My notebook', 'my-notebook.ipynb', competition='paddy-disease-classification')"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "metadata": {},
205 |    "outputs": [],
206 |    "source": [
207 |     "#|export\n",
208 |     "def push_notebook(user, id, title, file, path='.', competition=None, private=True, gpu=False, internet=True, linked_datasets=None):\n",
209 |     "    \"Push notebook `file` to Kaggle Notebooks\"\n",
210 |     "    meta = nb_meta(user, id, title, file=file, competition=competition, private=private, gpu=gpu, internet=internet, linked_datasets=linked_datasets)\n",
211 |     "    path = Path(path)\n",
212 |     "    nm = 'kernel-metadata.json'\n",
213 |     "    path.mkdir(exist_ok=True, parents=True)\n",
214 |     "    with open(path/nm, 'w') as f: json.dump(meta, f, indent=2)\n",
215 |     "    api = import_kaggle()\n",
216 |     "    api.kernels_push_cli(str(path))"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "metadata": {},
222 |    "source": [
223 |     "Note that Kaggle recommends that the `id` match the *slug* for the title -- i.e., it should be the same as the title, but lowercase, no punctuation, and spaces replaced with dashes. 
E.g.:\n",
224 |     "\n",
225 |     "```python\n",
226 |     "push_notebook('jhoward', 'first-steps-road-to-the-top-part-1',\n",
227 |     "              title='First Steps: Road to the Top, Part 1',\n",
228 |     "              file='first-steps-road-to-the-top-part-1.ipynb',\n",
229 |     "              competition='paddy-disease-classification',\n",
230 |     "              private=False, gpu=True)\n",
231 |     "```"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "markdown",
236 |    "metadata": {},
237 |    "source": [
238 |     "## Datasets"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "markdown",
243 |    "metadata": {},
244 |    "source": [
245 |     "### Core"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": null,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "#| export\n",
255 |     "def check_ds_exists(dataset_slug # Dataset slug (ie \"zillow/zecon\")\n",
256 |     "                   ):\n",
257 |     "    '''Checks if a dataset exists in kaggle and returns boolean'''\n",
258 |     "    api = import_kaggle()\n",
259 |     "    ds_search = L(api.dataset_list(mine=True)).filter(lambda x: str(x)==dataset_slug)\n",
260 |     "    if len(ds_search)==1: return True\n",
261 |     "    elif len(ds_search)==0: return False\n",
262 |     "    else: raise Exception(\"Multiple datasets found - Check Manually\")"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": null,
268 |    "metadata": {},
269 |    "outputs": [],
270 |    "source": [
271 |     "#| export\n",
272 |     "def mk_dataset(dataset_path, # Local path to create dataset in\n",
273 |     "               title, # Name of the dataset\n",
274 |     "               force=False, # Should it overwrite or error if exists?\n",
275 |     "               upload=True # Should it upload and create on kaggle\n",
276 |     "              ):\n",
277 |     "    '''Creates minimal dataset metadata needed to push new dataset to kaggle'''\n",
278 |     "    dataset_path = Path(dataset_path)\n",
279 |     "    dataset_path.mkdir(exist_ok=force,parents=True)\n",
280 |     "    api = import_kaggle()\n",
281 |     "    api.dataset_initialize(dataset_path)\n",
282 |     "    md = json.load(open(dataset_path/'dataset-metadata.json'))\n",
283 |     "    md['title'] = title\n",
284 |     "    md['id'] = md['id'].replace('INSERT_SLUG_HERE',title)\n",
285 |     "    json.dump(md,open(dataset_path/'dataset-metadata.json','w'))\n",
286 |     "    if upload: (dataset_path/'empty.txt').touch()\n",
287 |     "    if upload: api.dataset_create_new(str(dataset_path),public=True,dir_mode='zip',quiet=True)"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "metadata": {},
294 |    "outputs": [
295 |     {
296 |      "name": "stdout",
297 |      "output_type": "stream",
298 |      "text": [
299 |       "Data package template written to: testds/dataset-metadata.json\n"
300 |      ]
301 |     }
302 |    ],
303 |    "source": [
304 |     "mk_dataset('./testds','mytestds',force=True)\n",
305 |     "md = json.load(open('./testds/dataset-metadata.json'))\n",
306 |     "assert md['title'] == 'mytestds'\n",
307 |     "assert md['id'].endswith('/mytestds')"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "code",
312 |    "execution_count": null,
313 |    "metadata": {},
314 |    "outputs": [],
315 |    "source": [
316 |     "#| export\n",
317 |     "def get_dataset(dataset_path, # Local path to download dataset to\n",
318 |     "                dataset_slug, # Dataset slug (ie \"zillow/zecon\")\n",
319 |     "                unzip=True, # Should it unzip after downloading?\n",
320 |     "                force=False # Should it overwrite or error if dataset_path exists?\n",
321 |     "               ):\n",
322 |     "    '''Downloads an existing dataset and metadata from kaggle'''\n",
323 |     "    if not force: assert not Path(dataset_path).exists()\n",
324 |     "    api = import_kaggle()\n",
325 |     "    api.dataset_metadata(dataset_slug,str(dataset_path))\n",
326 |     "    
api.dataset_download_files(dataset_slug,str(dataset_path))\n", 327 | " if unzip:\n", 328 | " zipped_file = Path(dataset_path)/f\"{dataset_slug.split('/')[-1]}.zip\"\n", 329 | " import zipfile\n", 330 | " with zipfile.ZipFile(zipped_file, 'r') as zip_ref:\n", 331 | " zip_ref.extractall(Path(dataset_path))\n", 332 | " zipped_file.unlink()\n", 333 | " " 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "#| export\n", 343 | "def get_pip_library(dataset_path, # Local path to download pip library to\n", 344 | " pip_library, # name of library for pip to install\n", 345 | " pip_cmd=\"pip\" # pip base to use (ie \"pip3\" or \"pip\")\n", 346 | " ): \n", 347 | " '''Download the whl files for pip_library and store in dataset_path'''\n", 348 | " bashCommand = f\"{pip_cmd} download {pip_library} -d {dataset_path}\"\n", 349 | " process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)\n", 350 | " output, error = process.communicate()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "#| export\n", 360 | "def get_pip_libraries(dataset_path, # Local path to download pip library to\n", 361 | " requirements_path, # path to requirements file\n", 362 | " pip_cmd=\"pip\" # pip base to use (ie \"pip3\" or \"pip\")\n", 363 | " ):\n", 364 | " '''Download whl files for a requirements.txt file and store in dataset_path'''\n", 365 | " bashCommand = f\"{pip_cmd} download -r {requirements_path} -d {dataset_path}\"\n", 366 | " process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)\n", 367 | " output, error = process.communicate()" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "dl_path = Path('./mylib')\n", 377 | "get_pip_library(dl_path,'fastkaggle')\n", 378 | "assert 1==len([o for o in dl_path.ls() if str(o).startswith(f\"{dl_path}/fastkaggle\")])" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "#| export\n", 388 | "def push_dataset(dataset_path, # Local path where dataset is stored \n", 389 | " version_comment # Comment associated with this dataset update\n", 390 | " ):\n", 391 | " '''Push dataset update to kaggle. Dataset path must contain dataset metadata file'''\n", 392 | " api = import_kaggle()\n", 393 | " api.dataset_create_version(str(dataset_path),version_comment,dir_mode='zip',quiet=True)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "#| export\n", 403 | "def get_local_ds_ver(lib_path, # Local path dataset is stored in\n", 404 | " lib # Name of library (ie \"fastcore\")\n", 405 | " ):\n", 406 | " '''checks a local copy of kaggle dataset for library version number'''\n", 407 | " wheel_lib_name = lib.replace('-','_')\n", 408 | " local_path = (lib_path/f\"library-{lib}\")\n", 409 | " lib_whl = local_path.ls().filter(lambda x: wheel_lib_name in x.name.lower())\n", 410 | " if 1==len(lib_whl):\n", 411 | " return re.search(f\"(?<={wheel_lib_name}-)[\\d+.]+\\d\",lib_whl[0].name.lower())[0]\n", 412 | " elif 0