├── .gitignore ├── LICENSE ├── README.md ├── kaggle_data ├── __init__.py ├── downloader.py └── utils.py ├── setup.py └── tests ├── __init__.py └── tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | input/ 2 | tests/input/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Tuatini Godard 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # This project may break as the Kaggle website changes 2 | 3 | To solve this issue I will move the project calls to the [official Kaggle API](https://github.com/Kaggle/kaggle-api) in the future. 
4 | 5 | # Kaggle-data-downloader 6 | An unofficial Kaggle datasets downloader very much inspired by [kaggle-cli](https://github.com/floydwch/kaggle-cli) 7 | 8 | ## Installation 9 | 10 | ``` 11 | $ pip install -U kaggle_data 12 | ``` 13 | 14 | Depending on the format of your archive you may need to install some tools 15 | on your computer. 16 | 17 | - For 7-zip archives: You need the `7za` program from p7zip. 18 | - On macOS: `brew install p7zip` 19 | - On Ubuntu: `sudo apt-get install p7zip-full` 20 | 21 | ## Usage 22 | Please note that you must accept the competition rules on the Kaggle website before running these commands. 23 | 24 | Usage example: 25 | ``` 26 | from kaggle_data.downloader import KaggleDataDownloader 27 | 28 | destination_path = "input/" 29 | 30 | downloader = KaggleDataDownloader("Ekami", "somePassword", "planet-understanding-the-amazon-from-space") 31 | output_path = downloader.download_dataset("test-jpg-additional.tar.7z", destination_path) 32 | downloader.decompress(output_path, destination_path) 33 | downloader.decompress(destination_path + "test-jpg-additional.tar", destination_path) 34 | ``` 35 | 36 | ## Packaging the project for Pypi deploy 37 | 38 | ``` 39 | pip install twine 40 | pip install wheel 41 | python setup.py sdist 42 | python setup.py bdist_wheel 43 | ``` 44 | 45 | [Create a pypi account](https://packaging.python.org/tutorials/distributing-packages/#id76) and create `$HOME/.pypirc` with: 46 | ``` 47 | [pypi] 48 | username = 49 | password = 50 | ``` 51 | 52 | Then upload the packages with: 53 | ``` 54 | twine upload dist/* 55 | ``` 56 | -------------------------------------------------------------------------------- /kaggle_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/godardt/kaggle-data-downloader/e9133113db19392520add6f7cb1c99eaa70d81fc/kaggle_data/__init__.py -------------------------------------------------------------------------------- 
import os
import re
import sys


class KaggleDataDownloader:
    """
    Download and extract the data files of a Kaggle competition.

    The third-party packages (``mechanicalsoup``, ``progressbar``) and the
    sibling ``utils`` module are imported inside the methods that use them,
    so importing this class does not require them to be importable.
    """

    # Patterns used to scrape the login pages, compiled once for the class.
    # Both use a named group; the error pattern keeps the greedy ``.+`` so
    # that several reported errors are returned as one string.
    _TOKEN_PATTERN = re.compile(r"antiForgeryToken: '(?P<token>[^']+)'")
    _LOGIN_ERROR_PATTERN = re.compile(r'"errors":\["(?P<message>.+)"\]')

    def __init__(self, username, password, competition_name):
        """

        :param username: string
            Your kaggle username
        :param password: string
            Your kaggle password
        :param competition_name: string
            The name of the competition which can be found
            in the competition url.
            For example with this competition:
            https://www.kaggle.com/c/planet-understanding-the-amazon-from-space
            The competition name would be: planet-understanding-the-amazon-from-space
        """
        self.username = username
        self.password = password
        self.competition_name = competition_name

    def download_dataset(self, dataset_name, destination_path):
        """
        Log in, look the dataset up on the competition data page and
        download it.

        :param dataset_name: string or None
            The name of the dataset you want to download. When None, the
            first download link found on the data page is used.
        :param destination_path: string
            The path where you want to store the dataset
        :return: string or None
            The path where the dataset was downloaded, or None when the
            download failed or no matching dataset was found (the reason
            is printed on stderr).
        """
        try:
            browser = self._login()
            base = 'https://www.kaggle.com'
            data_url = '/'.join([base, 'c', self.competition_name, 'data'])
            data_page = browser.get(data_url)

            url = self._find_dataset_url(str(data_page.soup), dataset_name)
            if url is None:
                print('Dataset {} not found for competition {}'.format(
                    dataset_name, self.competition_name), file=sys.stderr)
                return None
            return self._download_file(browser, url, destination_path)
        except Exception as e:
            # Errors are reported rather than raised, as callers of this
            # method have always received None on failure.
            print(e, file=sys.stderr)
            return None

    def _find_dataset_url(self, page_source, dataset_name):
        """
        Find the download url of a dataset in the data page source.

        :param page_source: string
            The text of the competition data page
        :param dataset_name: string or None
            The file name to look for; None selects the first link
        :return: string or None
            The absolute download url, or None when nothing matches
        """
        base = 'https://www.kaggle.com'
        # The competition name is escaped so that characters such as "."
        # are matched literally instead of being read as regex syntax.
        pattern = r'"url":"(/c/{}/download/[^"]+)"'.format(
            re.escape(self.competition_name))
        for link in re.findall(pattern, page_source):
            url = base + link
            if dataset_name is None or url.endswith('/' + dataset_name):
                return url
        return None

    @staticmethod
    def decompress(file_path, destination_path):
        """
        Uncompress an archive
        :param file_path: string
            Path of your dataset archive
        :param destination_path: string
            Path where you want to extract your file

        Only 7z, zip and tar archives are extracted. Any other type is
        reported on stderr and left untouched.
        """
        from . import utils

        extractors = {
            '7z': utils.extract_7_zip,
            'zip': utils.extract_zip,
            'tar': utils.extract_tar,
        }
        file_type = utils.get_archive_type(file_path)
        extractor = extractors.get(file_type)
        if extractor is None:
            print('Unsupported archive type {} for {}'.format(
                file_type, file_path), file=sys.stderr)
            return
        extractor(file_path, destination_path)

    def _download_file(self, browser, url, destination_path):
        """
        Stream a file to disk, resuming a partial download when possible.

        :param browser: object
            The logged-in browser returned by ``_login``
        :param url: string
            The absolute url of the file to download
        :param destination_path: string
            The directory where the file is stored
        :return: string
            The path of the local file
        :raises Exception:
            When the server sends no Content-Length, when the local file
            is larger than the remote one, or when the url resolves to an
            html page instead of a file.
        """
        file_name = url.split('/')[-1]
        # Joined rather than concatenated, so a destination without a
        # trailing separator is not glued to the file name.
        local_filename = os.path.join(destination_path, file_name)

        length_header = browser.request('head', url).headers.get('Content-Length')
        if length_header is None:
            raise Exception(
                'No Content-Length header returned for {}'.format(url))
        content_length = int(length_header)

        print('downloading {} to {}\n'.format(url, local_filename))
        file_size = 0
        if os.path.isfile(local_filename):
            file_size = os.path.getsize(local_filename)

        if file_size == content_length:
            print('{} already downloaded !'.format(local_filename))
            return local_filename
        if file_size > content_length:
            raise Exception('Something wrong here, Incorrect file !')

        # Ask only for the missing bytes when a partial file exists.
        headers = {}
        if file_size:
            headers['Range'] = 'bytes={}-'.format(file_size)

        stream = browser.get(url, stream=True, headers=headers)
        if not self.is_downloadable(stream):
            warning = (
                'Warning: '
                'download url for file {} resolves to an html document '
                'rather than a downloadable file.\n'
                'Is it possible you have not '
                'accepted the competition\'s rules on the kaggle website?'.format(local_filename)
            )
            raise Exception('{}\n'.format(warning))

        mode = 'ab'
        if file_size and stream.status_code != 206:
            # The server ignored the Range header and is sending the whole
            # file: appending would corrupt it, so start over.
            file_size = 0
            mode = 'wb'

        directory = os.path.dirname(local_filename)
        if directory:  # os.makedirs('') raises
            os.makedirs(directory, exist_ok=True)

        # Imported here: only needed once bytes are actually transferred.
        import progressbar

        widgets = [file_name, ' ', progressbar.Percentage(), ' ',
                   progressbar.Bar(marker='#'), ' ',
                   progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
        bar = progressbar.ProgressBar(widgets=widgets,
                                      maxval=content_length).start()
        finished_bytes = file_size
        bar.update(finished_bytes)

        with open(local_filename, mode) as f:
            for chunk in stream.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    finished_bytes += len(chunk)
                    bar.update(finished_bytes)
        bar.finish()
        return local_filename

    def is_downloadable(self, response):
        """
        Checks whether the response object is a html page
        or a likely downloadable file.
        Intended to detect error pages or prompts
        such as kaggle's competition rules acceptance prompt.

        :param response: object
            A response exposing a ``headers`` mapping
        :return: bool
            False if the response is a html page that is not marked as an
            attachment. True otherwise.
        """
        content_type = response.headers.get('Content-Type', '')
        content_disp = response.headers.get('Content-Disposition', '')

        # A html body which is not marked as an attachment is most likely
        # a rules acceptance prompt or an error page.
        is_html_page = ('text/html' in content_type
                        and 'attachment' not in content_disp)
        return not is_html_page

    def _login(self):
        """
        Open a browser session and log in to kaggle.

        :return: mechanicalsoup.Browser
            The browser holding the session
        :raises Exception:
            When the anti-forgery token cannot be found on the login page
            or when the login response reports errors.
        """
        from mechanicalsoup import Browser

        login_url = 'https://www.kaggle.com/account/login'
        browser = Browser()

        login_page = browser.get(login_url)
        token_match = self._TOKEN_PATTERN.search(str(login_page.soup))
        if token_match is None:
            raise Exception(
                'Unable to find the anti-forgery token on the login page')

        login_result_page = browser.post(login_url,
                                         data={
                                             'username': self.username,
                                             'password': self.password,
                                             '__RequestVerificationToken': token_match.group('token')
                                         })

        error_match = self._LOGIN_ERROR_PATTERN.search(str(login_result_page.soup))
        if error_match:
            raise Exception('There was an error logging in: ' + error_match.group('message'))

        return browser
import subprocess
import tarfile
import zipfile

# Leading bytes that identify each supported (non-tar) archive format.
_MAGIC_NUMBERS = {
    b"\x1f\x8b\x08": "gz",
    b"\x42\x5a\x68": "bz2",
    b"\x50\x4b\x03\x04": "zip",
    b"\x37\x7A\xBC\xAF\x27\x1C": "7z"
}
_MAX_MAGIC_LENGTH = max(len(magic) for magic in _MAGIC_NUMBERS)


def get_archive_type(file_path):
    """
    Returns the type of archive passed to file_path
    :param file_path: string
        The path of the archive
    :return: string or None
        Returns either: "tar", "gz", "bz2", "zip", "7z",
        or None when the file matches none of them
    """
    # Checked first: anything the tarfile module can open is reported as
    # "tar", whatever its leading bytes are.
    if tarfile.is_tarfile(file_path):
        return "tar"

    with open(file_path, "rb") as f:
        file_start = f.read(_MAX_MAGIC_LENGTH)

    for magic, filetype in _MAGIC_NUMBERS.items():
        if file_start.startswith(magic):
            return filetype
    return None


def extract_7_zip(file_path, destination_path):
    """
    Extract a 7-zip archive with the external ``7za`` program.
    :param file_path: string
        The path of the archive
    :param destination_path: string
        The directory where the content is extracted
    :raises subprocess.CalledProcessError:
        When ``7za`` exits with a non-zero status
    """
    # "x" extracts with full paths, "-aoa" overwrites existing files
    command = ['7za', 'x', file_path, '-o' + destination_path, '-aoa']

    print("Extracting {} to {} ...".format(file_path, destination_path))
    # check=True so a failed extraction is not reported as finished
    subprocess.run(command, check=True)
    print("Extraction finished")


def extract_tar(file_path, destination_path):
    """
    Extract a tar archive (compressed or not).
    :param file_path: string
        The path of the archive
    :param destination_path: string
        The directory where the content is extracted
    """
    print("Extracting {} to {} ...".format(file_path, destination_path))
    with tarfile.open(file_path) as tar:
        if hasattr(tarfile, "data_filter"):
            # Refuses members that would be written outside of
            # destination_path (absolute paths, "..", unsafe links).
            tar.extractall(path=destination_path, filter="data")
        else:
            # Interpreters without extraction filters: legacy behaviour
            tar.extractall(path=destination_path)
    print("Extraction finished")


def extract_zip(file_path, destination_path):
    """
    Extract a zip archive.
    :param file_path: string
        The path of the archive
    :param destination_path: string
        The directory where the content is extracted
    """
    print("Extracting {} to {} ...".format(file_path, destination_path))
    with zipfile.ZipFile(file_path) as archive:
        archive.extractall(path=destination_path)
    print("Extraction finished")
import os
import sys

# Put the repository root (the parent of this tests/ directory) on sys.path
# so that ``kaggle_data`` can be imported whatever the current working
# directory is. The path is derived from this file rather than from the cwd.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from kaggle_data.downloader import KaggleDataDownloader


class TestKaggleDataDownloader:
    """
    Use with pytest -q -s tests.py

    The test downloads real competition files: it needs network access and
    the KAGGLE_USER / KAGGLE_PASSWD environment variables.
    """

    def test_download_data(self):
        competition_name = "planet-understanding-the-amazon-from-space"
        dataset_name = "test-jpg-additional.tar.7z"
        labels_name = "train_v2.csv.zip"
        destination_path = "input/"

        username = os.getenv("KAGGLE_USER")
        password = os.getenv("KAGGLE_PASSWD")
        assert username and password, "KAGGLE_USER and KAGGLE_PASSWD must be set"

        downloader = KaggleDataDownloader(username, password, competition_name)

        # download_dataset returns None on failure; fail here with a clear
        # message instead of inside decompress.
        output_path = downloader.download_dataset(dataset_name, destination_path)
        assert output_path is not None, "download of {} failed".format(dataset_name)
        downloader.decompress(output_path, destination_path)
        # The 7z archive wraps a tar archive which is extracted in turn
        downloader.decompress(os.path.join(destination_path, "test-jpg-additional.tar"),
                              destination_path)

        labels_output_path = downloader.download_dataset(labels_name, destination_path)
        assert labels_output_path is not None, "download of {} failed".format(labels_name)
        downloader.decompress(labels_output_path, destination_path)