├── .gitignore
├── LICENSE
├── README.md
├── kaggle_data
    ├── __init__.py
    ├── downloader.py
    └── utils.py
├── setup.py
└── tests
    ├── __init__.py
    └── tests.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | input/
  2 | tests/input/
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | env/
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | 
 52 | # Translations
 53 | *.mo
 54 | *.pot
 55 | 
 56 | # Django stuff:
 57 | *.log
 58 | local_settings.py
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # dotenv
 86 | .env
 87 | 
 88 | # virtualenv
 89 | .venv
 90 | venv/
 91 | ENV/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Tuatini Godard
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # This project may break as the Kaggle website changes
 2 | 
 3 | To solve this issue I will move the project calls to the [official Kaggle API](https://github.com/Kaggle/kaggle-api) in the future.
 4 | 
 5 | # Kaggle-data-downloader
 6 | An unofficial Kaggle datasets downloader very much inspired by [kaggle-cli](https://github.com/floydwch/kaggle-cli)
 7 | 
 8 | ## Installation
 9 | 
10 | ```
11 | $ pip install -U kaggle_data
12 | ```
13 | 
14 | Depending on the format of your archive you may need to install some tools
15 | on your computer.
16 | 
17 |  - For 7-zip archives: You need the `7za` program from p7zip. 
18 |     - On macOS: `brew install p7zip`
19 |     - On Ubuntu: `sudo apt-get install p7zip-full`
20 | 
21 | ## Usage
 22 | Please note that you must accept the competition rules on the Kaggle website before running these commands.
23 | 
24 | Usage example:
25 | ```
26 | from kaggle_data.downloader import KaggleDataDownloader
27 | 
28 | destination_path = "input/"
29 | 
30 | downloader = KaggleDataDownloader("Ekami", "somePassword", "planet-understanding-the-amazon-from-space")
31 | output_path = downloader.download_dataset("test-jpg-additional.tar.7z", destination_path)
32 | downloader.decompress(output_path, destination_path)
33 | downloader.decompress(destination_path + "test-jpg-additional.tar", destination_path)
34 | ```
35 | 
 36 | ## Packaging the project for PyPI deployment
37 | 
38 | ```
39 | pip install twine
40 | pip install wheel
41 | python setup.py sdist
42 | python setup.py bdist_wheel
43 | ```
44 | 
45 | [Create a pypi account](https://packaging.python.org/tutorials/distributing-packages/#id76) and create `$HOME/.pypirc` with:
46 | ```
47 | [pypi]
48 | username = <username>
49 | password = <password>
50 | ```
51 | 
52 | Then upload the packages with:
53 | ```
54 | twine upload dist/*
55 | ```
56 | 


--------------------------------------------------------------------------------
/kaggle_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/godardt/kaggle-data-downloader/e9133113db19392520add6f7cb1c99eaa70d81fc/kaggle_data/__init__.py


--------------------------------------------------------------------------------
/kaggle_data/downloader.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import os
  3 | import sys
  4 | from . import utils
  5 | import progressbar
  6 | from mechanicalsoup import Browser
  7 | 
  8 | 
  9 | class KaggleDataDownloader:
 10 |     def __init__(self, username, password, competition_name):
 11 |         """
 12 |         
 13 |         :param username: string
 14 |             Your kaggle username
 15 |         :param password: string
 16 |             Your kaggle password
 17 |         :param competition_name: string
 18 |             The name of the competition which can be found
 19 |             in the competition url.
 20 |             For example wih this competition: 
 21 |                 https://www.kaggle.com/c/planet-understanding-the-amazon-from-space
 22 |             The competition name would be: planet-understanding-the-amazon-from-space
 23 |         """
 24 |         self.username = username
 25 |         self.password = password
 26 |         self.competition_name = competition_name
 27 | 
 28 |     def download_dataset(self, dataset_name, destination_path):
 29 |         """
 30 |         
 31 |         :param dataset_name: string
 32 |             The name of the dataset you want to download
 33 |         :param destination_path: string
 34 |             The path where you want to store the dataset
 35 |         :return: string
 36 |             The path where the dataset was downloaded
 37 |         """
 38 |         try:
 39 |             browser = self._login()
 40 |             base = 'https://www.kaggle.com'
 41 |             data_url = '/'.join([base, 'c', self.competition_name, 'data'])
 42 |             data_page = browser.get(data_url)
 43 | 
 44 |             data = str(data_page.soup)
 45 |             links = re.findall(
 46 |                 '"url":"(/c/{}/download/[^"]+)"'.format(self.competition_name), data
 47 |             )
 48 |             for link in links:
 49 |                 url = base + link
 50 |                 if dataset_name is None or url.endswith('/' + dataset_name):
 51 |                     return self._download_file(browser, url, destination_path)
 52 | 
 53 |         except Exception as e:
 54 |             print(e, file=sys.stderr)
 55 | 
 56 |     @staticmethod
 57 |     def decompress(file_path, destination_path):
 58 |         """
 59 |             Uncompress an archive
 60 |         :param file_path: string
 61 |             Path of your dataset archive
 62 |         :param destination_path: string
 63 |             Path where you want to extract your file
 64 |         """
 65 |         file_type = utils.get_archive_type(file_path)
 66 |         if file_type == '7z':
 67 |             utils.extract_7_zip(file_path, destination_path)
 68 |         elif file_type == 'zip':
 69 |             utils.extract_zip(file_path, destination_path)
 70 |         elif file_type == 'tar':
 71 |             utils.extract_tar(file_path, destination_path)
 72 | 
 73 |     def _download_file(self, browser, url, destination_path):
 74 |         local_filename = url.split('/')[-1]
 75 |         headers = {}
 76 |         done = False
 77 |         file_size = 0
 78 |         content_length = int(
 79 |             browser.request('head', url).headers.get('Content-Length')
 80 |         )
 81 | 
 82 |         widgets = [local_filename, ' ', progressbar.Percentage(), ' ',
 83 |                    progressbar.Bar(marker='#'), ' ',
 84 |                    progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
 85 | 
 86 |         local_filename = destination_path + local_filename
 87 |         print('downloading {} to {}\n'.format(url, local_filename))
 88 |         if os.path.isfile(local_filename):
 89 |             file_size = os.path.getsize(local_filename)
 90 |             if file_size < content_length:
 91 |                 headers['Range'] = 'bytes={}-'.format(file_size)
 92 |             else:
 93 |                 done = True
 94 | 
 95 |         finished_bytes = file_size
 96 | 
 97 |         if file_size == content_length:
 98 |             print('{} already downloaded !'.format(local_filename))
 99 |             return local_filename
100 |         elif file_size > content_length:
101 |             raise Exception('Something wrong here, Incorrect file !')
102 |         else:
103 |             bar = progressbar.ProgressBar(widgets=widgets,
104 |                                           maxval=content_length).start()
105 |             bar.update(finished_bytes)
106 | 
107 |         if not done:
108 |             stream = browser.get(url, stream=True, headers=headers)
109 |             if not self.is_downloadable(stream):
110 |                 warning = (
111 |                     'Warning:'
112 |                     'download url for file {} resolves to an html document'
113 |                     'rather than a downloadable file. \n'
114 |                     'See the downloaded file for details.'
115 |                     'Is it possible you have not'
116 |                     'accepted the competition\'s rules on the kaggle website?'.format(local_filename)
117 |                 )
118 |                 raise Exception('{}\n'.format(warning))
119 |             os.makedirs(os.path.dirname(local_filename), exist_ok=True)
120 |             with open(local_filename, 'ab') as f:
121 |                 for chunk in stream.iter_content(chunk_size=1024):
122 |                     if chunk:  # filter out keep-alive new chunks
123 |                         f.write(chunk)
124 |                         finished_bytes += len(chunk)
125 |                         bar.update(finished_bytes)
126 |             bar.finish()
127 |         return local_filename
128 | 
129 |     def is_downloadable(self, response):
130 |         '''
131 |         Checks whether the response object is a html page
132 |         or a likely downloadable file.
133 |         Intended to detect error pages or prompts
134 |         such as kaggle's competition rules acceptance prompt.
135 |         Returns True if the response is a html page. False otherwise.
136 |         '''
137 | 
138 |         content_type = response.headers.get('Content-Type', '')
139 |         content_disp = response.headers.get('Content-Disposition', '')
140 | 
141 |         if 'text/html' in content_type and 'attachment' not in content_disp:
142 |             # This response is a html file
143 |             # which is not marked as an attachment,
144 |             # so we likely hit a rules acceptance prompt
145 |             return False
146 |         return True
147 | 
148 |     def _login(self):
149 |         login_url = 'https://www.kaggle.com/account/login'
150 |         browser = Browser()
151 | 
152 |         login_page = browser.get(login_url)
153 |         token = re.search('antiForgeryToken: \'(?P<token>.+)\'', str(login_page.soup)).group(1)
154 |         login_result_page = browser.post(login_url,
155 |                                          data={
156 |                                              'username': self.username,
157 |                                              'password': self.password,
158 |                                              '__RequestVerificationToken': token
159 |                                          })
160 | 
161 |         error_match = re.search('"errors":\["(?P<error>.+)"\]', str(login_result_page.soup))
162 |         if error_match:
163 |             raise Exception('There was an error logging in: ' + error_match.group(1))
164 | 
165 |         return browser
166 | 


--------------------------------------------------------------------------------
/kaggle_data/utils.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import tarfile
 3 | import zipfile
 4 | 
 5 | 
 6 | def get_archive_type(file_path):
 7 |     """
 8 |     Returns the type of archive passed to file_path
 9 |     :param file_path: string
10 |         The path of the archive
11 |     :return: string
12 |         Returns either: "gz", "bz2", "zip", "7z", "tar"
13 |     """
14 |     file_type = None
15 |     magic_dict = {
16 |         b"\x1f\x8b\x08": "gz",
17 |         b"\x42\x5a\x68": "bz2",
18 |         b"\x50\x4b\x03\x04": "zip",
19 |         b"\x37\x7A\xBC\xAF\x27\x1C": "7z"
20 |     }
21 |     max_len = max(len(x) for x in magic_dict)
22 | 
23 |     if tarfile.is_tarfile(file_path):
24 |         return "tar"
25 | 
26 |     with open(file_path, "rb") as f:
27 |         file_start = f.read(max_len)
28 |         for magic, filetype in magic_dict.items():
29 |             if file_start.startswith(magic):
30 |                 file_type = filetype
31 | 
32 |     return file_type
33 | 
34 | 
def extract_7_zip(file_path, destination_path):
    """
    Extract a 7-zip archive with the external ``7za`` program.

    :param file_path: string
        The path of the 7-zip archive
    :param destination_path: string
        The directory where the archive is extracted
    :raises subprocess.CalledProcessError:
        When ``7za`` exits with a status other than 0 (success)
        or 1 (finished with warnings)
    """
    # 'x' extracts with full paths, '-o' sets the output directory and
    # '-aoa' overwrites existing files without prompting.
    command = ['7za', 'x', file_path, '-o' + destination_path, '-aoa']

    print("Extracting {} to {} ...".format(file_path, destination_path))
    result = subprocess.run(command)
    if result.returncode not in (0, 1):
        # Do not announce success when 7za reported a fatal error.
        raise subprocess.CalledProcessError(result.returncode, command)
    if result.returncode == 1:
        print("Extraction finished with warnings")
    else:
        print("Extraction finished")
41 | 
42 | 
def extract_tar(file_path, destination_path):
    """
    Extract a tar archive, compressed or not.

    :param file_path: string
        The path of the tar archive
    :param destination_path: string
        The directory where the archive is extracted
    """
    print("Extracting {} to {} ...".format(file_path, destination_path))
    with tarfile.open(file_path) as tar:
        if hasattr(tarfile, "data_filter"):
            # The archive was downloaded, so refuse members that would be
            # written outside destination_path (absolute paths, "..",
            # links pointing outside).
            tar.extractall(path=destination_path, filter="data")
        else:
            # Interpreters without extraction filters.
            tar.extractall(path=destination_path)
    print("Extraction finished")
48 | 
49 | 
def extract_zip(file_path, destination_path):
    """
    Extract every member of a zip archive.

    :param file_path: string
        The path of the zip archive
    :param destination_path: string
        The directory where the archive is extracted
    """
    start_message = "Extracting {} to {} ...".format(file_path, destination_path)
    print(start_message)

    zipped = zipfile.ZipFile(file_path)
    try:
        zipped.extractall(path=destination_path)
    finally:
        # Always release the file handle, even when the extraction fails.
        zipped.close()

    print("Extraction finished")
55 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='kaggle_data',
 5 |     version='0.4',
 6 | 
 7 |     description='An unofficial Kaggle datasets downloader',
 8 |     long_description="https://github.com/EKami/kaggle_data/blob/master/README.md",
 9 |     url='https://github.com/EKami/kaggle_data',
10 |     author='GODARD Tuatini',
11 |     author_email='tuatinigodard@gmail.com',
12 |     license='MIT',
13 | 
14 |     classifiers=[
15 |         #   3 - Alpha
16 |         #   4 - Beta
17 |         #   5 - Production/Stable
18 |         'Development Status :: 3 - Alpha',
19 |         'Intended Audience :: Developers',
20 |         'Topic :: Software Development :: Build Tools',
21 |         'License :: OSI Approved :: MIT License',
22 | 
23 |         'Programming Language :: Python :: 3',
24 |         'Programming Language :: Python :: 3.3',
25 |         'Programming Language :: Python :: 3.4',
26 |         'Programming Language :: Python :: 3.5',
27 |         'Programming Language :: Python :: 3.6'
28 |     ],
29 | 
30 |     keywords='development',
31 |     packages=find_packages(exclude=['tests']),
32 |     install_requires=['mechanicalsoup',
33 |                       'progressbar2'],
34 | )
35 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/godardt/kaggle-data-downloader/e9133113db19392520add6f7cb1c99eaa70d81fc/tests/__init__.py


--------------------------------------------------------------------------------
/tests/tests.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | sys.path.append('../kaggle_data')
 5 | from kaggle_data.downloader import KaggleDataDownloader
 6 | 
 7 | 
 8 | class TestKaggleDataDownloader:
 9 |     """
10 |     Use with pytest -q -s tests.py
11 |     """
12 | 
13 |     def test_download_data(self):
14 |         competition_name = "planet-understanding-the-amazon-from-space"
15 |         dataset_name = "test-jpg-additional.tar.7z"
16 |         labels_name = "train_v2.csv.zip"
17 |         destination_path = "input/"
18 | 
19 |         downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"), os.getenv("KAGGLE_PASSWD"), competition_name)
20 | 
21 |         output_path = downloader.download_dataset(dataset_name, destination_path)
22 |         downloader.decompress(output_path, destination_path)
23 |         downloader.decompress(destination_path + "test-jpg-additional.tar", destination_path)
24 | 
25 |         labels_output_path = downloader.download_dataset(labels_name, destination_path)
26 |         downloader.decompress(labels_output_path, destination_path)
27 | 


--------------------------------------------------------------------------------