├── .github └── workflows │ ├── ci.yml │ ├── pr.yml │ └── release.yml ├── .readthedocs.yaml ├── AUTHORS ├── CHANGES ├── LICENSE ├── MANIFEST.in ├── README.md ├── doc ├── advanced.rst ├── api.rst ├── conf.py ├── index.rst ├── quickstart.rst └── requirements.txt ├── etc └── hadoop │ ├── core-site.xml │ ├── hdfs-site.xml │ ├── httpfs-site.xml │ └── log4j.properties ├── examples ├── avro-example.py ├── dataframe-example.py └── json-example.py ├── hdfs ├── __init__.py ├── __main__.py ├── client.py ├── config.py ├── ext │ ├── __init__.py │ ├── avro │ │ ├── __init__.py │ │ └── __main__.py │ ├── dataframe.py │ └── kerberos.py └── util.py ├── scripts ├── hadoop.sh └── version.sh ├── setup.py └── test ├── __init__.py ├── dat ├── client_template.py ├── weather.avro ├── weather.avsc └── weather.jsonl ├── test_client.py ├── test_config.py ├── test_examples.py ├── test_ext_avro.py ├── test_ext_dataframe.py ├── test_ext_kerberos.py ├── test_main.py ├── test_util.py └── util.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - master 6 | paths-ignore: 7 | - '**.md' 8 | - .readthedocs.yaml 9 | - doc/* 10 | jobs: 11 | test: 12 | name: Test 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | python-version: 18 | # - '3.6' (see https://github.com/actions/setup-python/issues/544) 19 | - '3.7' 20 | - '3.8' 21 | - '3.9' 22 | - '3.10' 23 | - '3.11' 24 | - '3.12' 25 | steps: 26 | - name: Check out 27 | uses: actions/checkout@v3 28 | - name: Setup Java 29 | uses: actions/setup-java@v3 30 | with: 31 | distribution: 'adopt' 32 | java-version: '8' 33 | - name: Setup Python 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | - name: Download Hadoop 38 | run: | 39 | echo "HADOOP_HOME=$(./scripts/hadoop.sh download)" >>"$GITHUB_ENV" 40 | - name: Configure Hadoop 41 | run: | 42 | echo "HADOOP_CONF_DIR=$(./scripts/hadoop.sh config)" >>"$GITHUB_ENV" 43 | - name: Start HDFS 44 | run: | 45 | ./scripts/hadoop.sh start 46 | echo "WEBHDFS_URL=http://$("$HADOOP_HOME/bin/hdfs" getconf -confKey dfs.namenode.http-address)" >>"$GITHUB_ENV" 47 | echo "HTTPFS_URL=http://localhost:14000" >>"$GITHUB_ENV" 48 | sleep 5 # TODO: Find a better way to wait for all datanodes to become reachable. 
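      # NOTE (untested sketch, not part of the original workflow): the fixed sleep above
      # could eventually be replaced by polling the namenode until it reports a live
      # datanode, for example:
      #   until "$HADOOP_HOME/bin/hdfs" dfsadmin -report | grep -q 'Live datanodes (1)'; do sleep 1; done
      # The exact `dfsadmin -report` output format varies across Hadoop versions, so the
      # grep pattern here is an assumption.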
49 | - name: Install 50 | run: pip install .[avro] coverage mock pytest pytest-cov pandas 51 | - name: Test on WebHDFS 52 | run: HDFSCLI_TEST_URL="$WEBHDFS_URL" python -m pytest --cov=hdfs 53 | - name: Test on HTTPFS 54 | run: HDFSCLI_TEST_URL="$HTTPFS_URL" HDFSCLI_NOSNAPSHOT=1 python -m pytest --cov=hdfs 55 | - name: Stop HDFS 56 | if: always() 57 | run: ./scripts/hadoop.sh stop 58 | tag: 59 | name: Tag 60 | needs: 61 | - test 62 | runs-on: ubuntu-latest 63 | steps: 64 | - name: Check out 65 | uses: actions/checkout@v3 66 | - name: Extract version 67 | id: extract-version 68 | run: | 69 | PACKAGE_VERSION="$(./scripts/version.sh)" 70 | echo "version=$PACKAGE_VERSION" >>"$GITHUB_OUTPUT" 71 | - name: Check if tag exists 72 | uses: mukunku/tag-exists-action@v1.1.0 73 | id: check-version 74 | with: 75 | tag: v${{ steps.extract-version.outputs.version }} 76 | env: 77 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 78 | - name: Create tag 79 | if: steps.check-version.outputs.exists == 'false' 80 | uses: pkgdeps/git-tag-action@v2 81 | with: 82 | git_commit_sha: ${{ github.sha }} 83 | git_tag_prefix: v 84 | github_repo: ${{ github.repository }} 85 | github_token: ${{ secrets.GITHUB_TOKEN }} 86 | version: ${{ steps.extract-version.outputs.version }} 87 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: PR 2 | on: 3 | pull_request: 4 | branches: 5 | - master 6 | paths-ignore: 7 | - '**.md' 8 | - .readthedocs.yaml 9 | - doc/* 10 | jobs: 11 | test: 12 | name: Test 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: 17 | # - '3.6' (see https://github.com/actions/setup-python/issues/544) 18 | - '3.7' 19 | - '3.8' 20 | - '3.9' 21 | - '3.10' 22 | - '3.11' 23 | - '3.12' 24 | steps: 25 | - name: Check out 26 | uses: actions/checkout@v3 27 | - name: Setup Java 28 | uses: actions/setup-java@v3 29 | with: 30 | distribution: 'adopt' 31 | java-version: '8' 32 | - name: Setup Python 33 | uses: actions/setup-python@v4 34 | with: 35 | python-version: ${{ matrix.python-version }} 36 | - name: Download Hadoop 37 | run: | 38 | echo "HADOOP_HOME=$(./scripts/hadoop.sh download)" >>"$GITHUB_ENV" 39 | - name: Configure Hadoop 40 | run: | 41 | echo "HADOOP_CONF_DIR=$(./scripts/hadoop.sh config)" >>"$GITHUB_ENV" 42 | - name: Start HDFS 43 | run: | 44 | ./scripts/hadoop.sh start 45 | echo "WEBHDFS_URL=http://$("$HADOOP_HOME/bin/hdfs" getconf -confKey dfs.namenode.http-address)" >>"$GITHUB_ENV" 46 | echo "HTTPFS_URL=http://localhost:14000" >>"$GITHUB_ENV" 47 | sleep 5 # TODO: Find a better way to wait for all datanodes to become reachable. 
48 | - name: Install 49 | run: pip install .[avro] coverage mock pytest pytest-cov pandas 50 | - name: Test on WebHDFS 51 | run: HDFSCLI_TEST_URL="$WEBHDFS_URL" python -m pytest --cov=hdfs 52 | - name: Test on HTTPFS 53 | run: HDFSCLI_TEST_URL="$HTTPFS_URL" HDFSCLI_NOSNAPSHOT=1 python -m pytest --cov=hdfs 54 | - name: Stop HDFS 55 | if: always() 56 | run: ./scripts/hadoop.sh stop 57 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | release: 4 | types: 5 | - published 6 | jobs: 7 | test: 8 | name: Publish 9 | timeout-minutes: 2 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out 13 | uses: actions/checkout@v2 14 | - name: Setup Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.10' 18 | - name: Install 19 | run: pip install twine 20 | - name: Publish 21 | run: | 22 | python setup.py sdist 23 | twine upload dist/* 24 | env: 25 | TWINE_USERNAME: __token__ 26 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 27 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-22.04 4 | tools: 5 | python: '3.9' 6 | sphinx: 7 | configuration: doc/conf.py 8 | python: 9 | install: 10 | - path: . 11 | method: pip 12 | extra_requirements: 13 | - avro 14 | - requirements: doc/requirements.txt 15 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Matthieu Monsch 2 | Artemy Kolchinsky 3 | Evan Borgstrom 4 | Wes McKinney 5 | Isaac Hodes 6 | -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- 1 | HdfsCLI 2 | ======= 3 | 4 | Version 2.0 (2015/20/08) 5 | ------------------------ 6 | 7 | * Add python 3 support (the Kerberos extension's requirements must however be 8 | manually installed). 9 | * Allow specifying relative client roots. These will be assumed relative to the 10 | user's home directory. 11 | * Add several client methods: `makedirs`, `set_times`, `checksum`, 12 | `set_replication`, etc. 13 | * Add `progress` argument to `Client.read`, `Client.upload`, and 14 | `Client.download`. Also add a `chunk_size` argument to the latter to allow 15 | better tracking. 16 | * Add `strict` option to `Client.status` and `Client.content` to perform path 17 | existence checks. 18 | * `Client.write` now can be used as a context manager (returning a file-like 19 | object). 20 | * `Client.read` and `Client.write` can now return file-like objects (supporting 21 | `read` and `write` calls respectively). 22 | * Improve robustness of `KerberosClient`. In particular add the 23 | `max_concurrency` parameter which can be tuned to prevent authentication 24 | errors when too many simultaneous requests are being made. Along with the new 25 | delay parameter, this lets us remove the timeouts in `Client.download` and 26 | `Client.upload`, which both simplify and speed up these functions. 27 | * Add `Config` class which handles all CLI configuration (e.g. aliases and 28 | logging). 29 | * Add `autoload.modules` and `autoload.paths` configuration options. 
30 | * Rename alias configuration sections to `ALIAS.alias` (the old format, 31 | `ALIAS_alias` is still supported). 32 | * Add `--verbose` option to CLI, enabling logging at various levels. 33 | * Switch Avro extension to using `fastavro`. This speeds up `AvroWriter` and 34 | `AvroReader` by a significant amount (~5 times faster on a reasonable 35 | connection). 36 | * Add `write` command to Avro CLI. 37 | * Remove CSV support for dataframe extension. 38 | 39 | Breaking changes: 40 | 41 | * Change default configuration file path (to `~/.hdfscli.cfg`). 42 | * Change location of `default.alias` option in configuration file (from command 43 | specific section to `global`). 44 | * Add `session` argument to `Client` and remove newly redundant parameters 45 | (e.g. `verify`, `cert`). 46 | * Remove `Client.from_alias` method (delegated to the new `Config` class). The 47 | `Client.from_options` method is now public (renamed from 48 | `Client._from_options`). 49 | * Change default entry point name to `hdfscli` to avoid clashing with Hadoop 50 | HDFS script. 51 | * `Client.delete` now returns a boolean indicating success (rather than failing 52 | if the path was already empty). 53 | * `Client.read` must now be used inside a `with` block. This ensures that 54 | connections are properly closed. The context manager now returns a file like 55 | object (useful for composing with other functions, e.g. to decode Avro). 56 | Setting `chunk_size` to a positive value will make it return a generator 57 | similar to the previous behavior. 58 | * Rename `Client.set_permissions` to `Client.set_permission` to make 59 | `permission` argument uniform across `Client` methods (always singular, 60 | consistent with WebHDFS API). 61 | * `Client.parts` will now throw an error if called on a normal file. 62 | * Make most client attributes private (e.g. `cert`, `timeout`, etc.), except 63 | `url` and `root`. 64 | * Remove `--` prefix from CLI commands. Also simplify CLI to only interactive, 65 | download and upload commands (write and read behavior can be achieved by 66 | passing '-' as local path). The `Client` API changes should make it more 67 | convenient to perform these from a python shell. 68 | * Rename several CLI options (e.g. `--log`, `--force`, `--version`). 69 | * Change meaning of `n_threads` option in `Client.download` and 70 | `Client.upload`. `0` now means one thread per part-file rather than a single 71 | thread. 72 | * Change `Client.walk` to be consistent with `os.walk`. Also change meaning of 73 | `depth` option (`0` being unlimited). 74 | * Add `status` option to `Client.list`, `Client.walk`, and `Client.parts`. By 75 | default these functions now only return the names of the relevant files and 76 | folders. 77 | * Remove `Client.append` method (replaced by `append` keyword argument to 78 | `Client.write`). 79 | * Symbols exported by extensions aren't imported in the main `hdfs` module 80 | anymore. This removes the need for some custom error handling (when 81 | dependency requirements weren't met). 82 | * Remove Bash autocompletion file (for now). 83 | * Remove compatibility layer for entry point configuration (i.e. 84 | `HDFS_ENTRY_POINT` isn't supported anymore). 85 | 86 | 87 | Version 1.4.0 (2015/07/24) 88 | -------------------------- 89 | 90 | * Add support for download and upload of arbitrary folders. 91 | * Deprecate `Client.append` (in favor of `append` argument to `Client.write`). 
92 | 93 | 94 | Version 1.1.0 (2015/06/23) 95 | -------------------------- 96 | 97 | * Rename Avro extension entry point to `hdfs-avro`. 98 | 99 | 100 | Version 1.0.1 (2015/06/17) 101 | -------------------------- 102 | 103 | * Added support for Windows. 104 | * Added support for remote filepaths with `=` characters. 105 | 106 | 107 | Version 0.3.0 (2014/11/14) 108 | -------------------------- 109 | 110 | * Added `--interactive` command. 111 | 112 | Breaking changes: 113 | 114 | * Renamed `--info` command to `--list`. 115 | * Made `--interactive` the new default command. 116 | 117 | 118 | Version 0.2.6 (2014/08/04) 119 | -------------------------- 120 | 121 | * Added parallelized downloading. 122 | * Added Avro-format reading and writing. 123 | * Added `hdfs.ext.dataframe` extension. 124 | 125 | 126 | Version 0.2.0 (2014/04/26) 127 | -------------------------- 128 | 129 | * Added `Client.status` and `Client.content` methods. 130 | * Added callback to `Client.write`. 131 | 132 | Breaking changes: 133 | 134 | * Removed content from `Client.walk`. 135 | * Simplified CLI. All download and uploads are normalized through standard in, 136 | and standard out. 137 | 138 | 139 | Version 0.1.0 (2014/03/25) 140 | -------------------------- 141 | 142 | * Initial release 143 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Matthieu Monsch. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HdfsCLI [![CI](https://github.com/mtth/hdfs/actions/workflows/ci.yml/badge.svg)](https://github.com/mtth/hdfs/actions/workflows/ci.yml) [![Pypi badge](https://badge.fury.io/py/hdfs.svg)](https://pypi.python.org/pypi/hdfs/) [![Downloads badge](https://img.shields.io/pypi/dm/hdfs.svg)](https://pypistats.org/packages/hdfs) 2 | 3 | API and command line interface for HDFS. 4 | 5 | ``` 6 | $ hdfscli --alias=dev 7 | 8 | Welcome to the interactive HDFS python shell. 9 | The HDFS client is available as `CLIENT`. 
10 | 11 | In [1]: CLIENT.list('models/') 12 | Out[1]: ['1.json', '2.json'] 13 | 14 | In [2]: CLIENT.status('models/2.json') 15 | Out[2]: { 16 | 'accessTime': 1439743128690, 17 | 'blockSize': 134217728, 18 | 'childrenNum': 0, 19 | 'fileId': 16389, 20 | 'group': 'supergroup', 21 | 'length': 48, 22 | 'modificationTime': 1439743129392, 23 | 'owner': 'drwho', 24 | 'pathSuffix': '', 25 | 'permission': '755', 26 | 'replication': 1, 27 | 'storagePolicy': 0, 28 | 'type': 'FILE' 29 | } 30 | 31 | In [3]: with CLIENT.read('models/2.json', encoding='utf-8') as reader: 32 | ...: from json import load 33 | ...: model = load(reader) 34 | ...: 35 | ``` 36 | 37 | ## Features 38 | 39 | * Python 3 bindings for the [WebHDFS][] (and [HttpFS][]) API, 40 | supporting both secure and insecure clusters. 41 | * Command line interface to transfer files and start an interactive client 42 | shell, with aliases for convenient namenode URL caching. 43 | * Additional functionality through optional extensions: 44 | 45 | + `avro`, to [read and write Avro files directly from HDFS][]. 46 | + `dataframe`, to [load and save Pandas dataframes][]. 47 | + `kerberos`, to [support Kerberos authenticated clusters][]. 48 | 49 | See the [documentation][] to learn more. 50 | 51 | ## Getting started 52 | 53 | ```sh 54 | $ pip install hdfs 55 | ``` 56 | 57 | Then hop on over to the [quickstart][] guide. A [Conda 58 | feedstock](https://github.com/conda-forge/python-hdfs-feedstock) is also 59 | available. 60 | 61 | ## Testing 62 | 63 | HdfsCLI is tested against both [WebHDFS][] and [HttpFS][]. There are two ways 64 | of running tests (see `scripts/` for helpers to set up a test HDFS cluster): 65 | 66 | ```sh 67 | $ HDFSCLI_TEST_URL=http://localhost:50070 pytest # Using a namenode's URL. 68 | $ HDFSCLI_TEST_ALIAS=dev pytest # Using an alias. 69 | ``` 70 | 71 | ## Contributing 72 | 73 | We'd love to hear what you think on the [issues][] page. Pull requests are also 74 | most welcome! 75 | 76 | [HttpFS]: http://hadoop.apache.org/docs/current/hadoop-hdfs-httpfs/ 77 | [WebHDFS]: http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html 78 | [read and write Avro files directly from HDFS]: https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.ext.avro 79 | [load and save Pandas dataframes]: https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.ext.dataframe 80 | [support Kerberos authenticated clusters]: https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.ext.kerberos 81 | [documentation]: https://hdfscli.readthedocs.io/ 82 | [quickstart]: https://hdfscli.readthedocs.io/en/latest/quickstart.html 83 | [issues]: https://github.com/mtth/hdfs/issues 84 | -------------------------------------------------------------------------------- /doc/advanced.rst: -------------------------------------------------------------------------------- 1 | .. default-role:: code 2 | 3 | 4 | .. _advanced_usage: 5 | 6 | Advanced usage 7 | ============== 8 | 9 | 10 | Path expansion 11 | -------------- 12 | 13 | All :class:`~hdfs.client.Client` methods provide a path expansion functionality 14 | via the :meth:`~hdfs.client.Client.resolve` method. It enables the use of 15 | special markers to identify paths. For example, it currently supports the 16 | `#LATEST` marker which expands to the last modified file inside a given folder. 17 | 18 | .. code-block:: python 19 | 20 | # Load the most recent data in the `tracking` folder. 
21 | with client.read('tracking/#LATEST') as reader: 22 | data = reader.read() 23 | 24 | See the method's documentation for more information. 25 | 26 | 27 | .. _custom_client: 28 | 29 | Custom client support 30 | --------------------- 31 | 32 | In order for the CLI to be able to instantiate arbitrary client classes, it has 33 | to be able to discover these first. This is done by specifying where they are 34 | defined in the `global` section of HdfsCLI's configuration file. For example, 35 | here is how we can make the :class:`~hdfs.ext.kerberos.KerberosClient` class 36 | available: 37 | 38 | .. code-block:: cfg 39 | 40 | [global] 41 | autoload.modules = hdfs.ext.kerberos 42 | 43 | More precisely, there are two options for telling the CLI where to load the 44 | clients from: 45 | 46 | + `autoload.modules`, a comma-separated list of modules (which must be on 47 | python's path). 48 | + `autoload.paths`, a comma-separated list of paths to python files. 49 | 50 | Implementing custom clients can be particularly useful for passing default 51 | options (e.g. a custom `session` argument to each client). We describe below a 52 | working example implementing a secure client with optional custom certificate 53 | support. 54 | 55 | We first implement our new client and save it somewhere, for example 56 | `/etc/hdfscli.py`. 57 | 58 | .. code-block:: python 59 | 60 | from hdfs import Client 61 | from requests import Session 62 | 63 | class SecureClient(Client): 64 | 65 | """A new client subclass for handling HTTPS connections. 66 | 67 | :param url: URL to namenode. 68 | :param cert: Local certificate. See `requests` documentation for details 69 | on how to use this. 70 | :param verify: Whether to check the host's certificate. 71 | :param \*\*kwargs: Keyword arguments passed to the default `Client` 72 | constructor. 73 | 74 | """ 75 | 76 | def __init__(self, url, cert=None, verify=True, **kwargs): 77 | session = Session() 78 | if ',' in cert: 79 | session.cert = [path.strip() for path in cert.split(',')] 80 | else: 81 | session.cert = cert 82 | if isinstance(verify, basestring): # Python 2. 83 | verify = verify.lower() in ('true', 'yes', 'ok') 84 | session.verify = verify 85 | super(SecureClient, self).__init__(url, session=session, **kwargs) 86 | 87 | We then edit our configuration to tell the CLI how to load this module and 88 | define a `prod` alias using our new client: 89 | 90 | .. code-block:: cfg 91 | 92 | [global] 93 | autoload.paths = /etc/hdfscli.py 94 | 95 | [prod.alias] 96 | client = SecureClient 97 | url = https://host:port 98 | cert = /etc/server.crt, /etc/key 99 | 100 | 101 | Note that options used to instantiate clients from the CLI (using 102 | :meth:`hdfs.client.Client.from_options` under the hood) are always passed in as 103 | strings. This is why we had to implement some parsing logic in the 104 | `SecureClient` constructor above. 105 | 106 | 107 | Tracking transfer progress 108 | -------------------------- 109 | 110 | The :meth:`~hdfs.client.Client.read`, :meth:`~hdfs.client.Client.upload`, 111 | :meth:`~hdfs.client.Client.download` client methods accept a `progress` 112 | callback argument which can be used to track transfers. The passed function 113 | will be called every `chunk_size` bytes with two arguments: 114 | 115 | + The source path of the file currently being transferred. 116 | + The number of bytes currently transferred for this file or `-1` to signal 117 | that this file's transfer has just finished. 
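For example, a callback with this signature can be passed directly to a transfer
method. This is a minimal sketch; the `print_progress` function and the paths used
below are illustrative only:

.. code-block:: python

   def print_progress(hdfs_path, nbytes):
     if nbytes >= 0:
       print('%s: %s bytes transferred' % (hdfs_path, nbytes))
     else:
       print('%s: transfer complete' % (hdfs_path, ))

   # The `progress` argument is accepted by `read`, `upload`, and `download`.
   client.download('logs/', 'local-logs/', progress=print_progress, chunk_size=2 ** 16)
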
118 | 119 | Below is an implementation of a toy tracker which simply outputs to standard 120 | error the total number of transferred bytes each time a file transfer completes 121 | (we must still take care to ensure correct behavior even during multi-threaded 122 | transfers). 123 | 124 | .. code-block:: python 125 | 126 | from sys import stderr 127 | from threading import Lock 128 | 129 | class Progress(object): 130 | 131 | """Basic progress tracker callback.""" 132 | 133 | def __init__(self): 134 | self._data = {} 135 | self._lock = Lock() 136 | 137 | def __call__(self, hdfs_path, nbytes): 138 | with self._lock: 139 | if nbytes >= 0: 140 | self._data[hdfs_path] = nbytes 141 | else: 142 | stderr.write('%s\n' % (sum(self._data.values()), )) 143 | 144 | Finally, note that the :meth:`~hdfs.client.Client.write` method doesn't expose 145 | a `progress` argument since this functionality can be replicated by passing a 146 | custom `data` generator (or within the context manager). 147 | 148 | 149 | Logging configuration 150 | --------------------- 151 | 152 | It is possible to configure and disable where the CLI logs are written for each 153 | entry point. To do this, we can set the following options in its corresponding 154 | section (the entry point's name suffixed with `.command`). For example: 155 | 156 | .. code-block:: cfg 157 | 158 | [hdfscli-avro.command] 159 | log.level = INFO 160 | log.path = /tmp/hdfscli/avro.log 161 | 162 | The following options are available: 163 | 164 | + `log.level`, handler log level (defaults to `DEBUG`). 165 | + `log.path`, path to log file. The log is rotated every day (keeping a single 166 | copy). The default is a file named `COMMAND.log` in your current temporary 167 | directory. It is possible to view the currently active log file at any time 168 | by using the `--log` option at the command line. 169 | + `log.disable`, disable logging to a file entirely (defaults to `False`). 170 | 171 | 172 | Renaming entry points 173 | --------------------- 174 | 175 | By default the command line entry point will be named `hdfscli`. You can choose 176 | another name by specifying the `HDFSCLI_ENTRY_POINT` environment variable at 177 | installation time: 178 | 179 | .. code-block:: bash 180 | 181 | $ HDFSCLI_ENTRY_POINT=hdfs pip install hdfs 182 | 183 | Extension prefixes will be adjusted similarly (e.g. in the previous example, 184 | `hdfscli-avro` would become `hdfs-avro`). 185 | -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | .. default-role:: code 2 | 3 | 4 | .. _api_reference: 5 | 6 | API reference 7 | ============= 8 | 9 | 10 | Client 11 | ------ 12 | 13 | .. automodule:: hdfs.client 14 | :members: 15 | :show-inheritance: 16 | 17 | 18 | Extensions 19 | ---------- 20 | 21 | The following extensions are currently available: 22 | 23 | 24 | .. _kerberos_extension: 25 | 26 | Kerberos 27 | ******** 28 | 29 | .. automodule:: hdfs.ext.kerberos 30 | :members: 31 | :show-inheritance: 32 | 33 | 34 | Avro 35 | **** 36 | 37 | .. automodule:: hdfs.ext.avro 38 | :members: 39 | :show-inheritance: 40 | 41 | 42 | Dataframe 43 | ********* 44 | 45 | .. automodule:: hdfs.ext.dataframe 46 | :members: 47 | 48 | 49 | Configuration 50 | ------------- 51 | 52 | .. automodule:: hdfs.config 53 | :members: 54 | :show-inheritance: 55 | 56 | 57 | Utilities 58 | --------- 59 | 60 | .. 
automodule:: hdfs.util 61 | :members: 62 | :show-inheritance: 63 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # hdfs documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Mar 6 16:04:56 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import os 16 | import sys 17 | try: 18 | from unittest import mock 19 | except ImportError: 20 | import mock 21 | 22 | MOCK_MODULES = ['fastavro', 'pandas', 'requests_kerberos'] 23 | for mod_name in MOCK_MODULES: 24 | sys.modules[mod_name] = mock.Mock() 25 | 26 | # If extensions (or modules to document with autodoc) are in another directory, 27 | # add these directories to sys.path here. If the directory is relative to the 28 | # documentation root, use os.path.abspath to make it absolute, like shown here. 29 | #sys.path.insert(0, os.path.abspath('.')) 30 | 31 | # -- General configuration ------------------------------------------------ 32 | 33 | # If your documentation needs a minimal Sphinx version, state it here. 34 | #needs_sphinx = '1.0' 35 | 36 | # Add any Sphinx extension module names here, as strings. They can be 37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 38 | # ones. 39 | extensions = [ 40 | 'sphinx.ext.autodoc', 41 | 'sphinx.ext.viewcode', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix of source filenames. 48 | source_suffix = '.rst' 49 | 50 | # The encoding of source files. 51 | #source_encoding = 'utf-8-sig' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # General information about the project. 57 | project = u'HdfsCLI' 58 | copyright = u'2014, Matthieu Monsch' 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | import hdfs 65 | # The short X.Y version. 66 | version = hdfs.__version__.rsplit('.', 1)[0] 67 | # The full version, including alpha/beta/rc tags. 68 | release = hdfs.__version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | #language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to some 75 | # non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 
93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built documents. 106 | #keep_warnings = False 107 | 108 | # Autodoc 109 | 110 | autoclass_content = 'both' 111 | 112 | 113 | # -- Options for HTML output ---------------------------------------------- 114 | 115 | # The theme to use for HTML and HTML Help pages. See the documentation for 116 | # a list of builtin themes. 117 | html_theme = 'default' 118 | 119 | # Theme options are theme-specific and customize the look and feel of a theme 120 | # further. For a list of options available for each theme, see the 121 | # documentation. 122 | #html_theme_options = {} 123 | 124 | # Add any paths that contain custom themes here, relative to this directory. 125 | #html_theme_path = [] 126 | 127 | # The name for this set of Sphinx documents. If None, it defaults to 128 | # " v documentation". 129 | #html_title = None 130 | 131 | # A shorter title for the navigation bar. Default is the same as html_title. 132 | #html_short_title = None 133 | 134 | # The name of an image file (relative to this directory) to place at the top 135 | # of the sidebar. 136 | #html_logo = None 137 | 138 | # The name of an image file (within the static path) to use as favicon of the 139 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 140 | # pixels large. 141 | #html_favicon = None 142 | 143 | # Add any paths that contain custom static files (such as style sheets) here, 144 | # relative to this directory. They are copied after the builtin static files, 145 | # so a file named "default.css" will overwrite the builtin "default.css". 146 | #html_static_path = ['_static'] 147 | html_static_path = [] 148 | 149 | # Add any extra paths that contain custom files (such as robots.txt or 150 | # .htaccess) here, relative to this directory. These files are copied 151 | # directly to the root of the documentation. 152 | #html_extra_path = [] 153 | 154 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 155 | # using the given strftime format. 156 | #html_last_updated_fmt = '%b %d, %Y' 157 | 158 | # If true, SmartyPants will be used to convert quotes and dashes to 159 | # typographically correct entities. 160 | #html_use_smartypants = True 161 | 162 | # Custom sidebar templates, maps document names to template names. 163 | #html_sidebars = {} 164 | 165 | # Additional templates that should be rendered to pages, maps page names to 166 | # template names. 167 | #html_additional_pages = {} 168 | 169 | # If false, no module index is generated. 170 | #html_domain_indices = True 171 | 172 | # If false, no index is generated. 173 | #html_use_index = True 174 | 175 | # If true, the index is split into individual pages for each letter. 176 | #html_split_index = False 177 | 178 | # If true, links to the reST sources are added to the pages. 179 | #html_show_sourcelink = True 180 | 181 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 182 | #html_show_sphinx = True 183 | 184 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
185 | #html_show_copyright = True 186 | 187 | # If true, an OpenSearch description file will be output, and all pages will 188 | # contain a tag referring to it. The value of this option must be the 189 | # base URL from which the finished HTML is served. 190 | #html_use_opensearch = '' 191 | 192 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 193 | #html_file_suffix = None 194 | 195 | # Output file base name for HTML help builder. 196 | htmlhelp_basename = 'hdfsdoc' 197 | 198 | 199 | # -- Options for LaTeX output --------------------------------------------- 200 | 201 | latex_elements = { 202 | # The paper size ('letterpaper' or 'a4paper'). 203 | #'papersize': 'letterpaper', 204 | 205 | # The font size ('10pt', '11pt' or '12pt'). 206 | #'pointsize': '10pt', 207 | 208 | # Additional stuff for the LaTeX preamble. 209 | #'preamble': '', 210 | } 211 | 212 | # Grouping the document tree into LaTeX files. List of tuples 213 | # (source start file, target name, title, 214 | # author, documentclass [howto, manual, or own class]). 215 | latex_documents = [ 216 | ('index', 'hdfs.tex', u'hdfs Documentation', 217 | u'Author', 'manual'), 218 | ] 219 | 220 | # The name of an image file (relative to this directory) to place at the top of 221 | # the title page. 222 | #latex_logo = None 223 | 224 | # For "manual" documents, if this is true, then toplevel headings are parts, 225 | # not chapters. 226 | #latex_use_parts = False 227 | 228 | # If true, show page references after internal links. 229 | #latex_show_pagerefs = False 230 | 231 | # If true, show URL addresses after external links. 232 | #latex_show_urls = False 233 | 234 | # Documents to append as an appendix to all manuals. 235 | #latex_appendices = [] 236 | 237 | # If false, no module index is generated. 238 | #latex_domain_indices = True 239 | 240 | 241 | # -- Options for manual page output --------------------------------------- 242 | 243 | # One entry per manual page. List of tuples 244 | # (source start file, name, description, authors, manual section). 245 | man_pages = [ 246 | ('index', 'hdfs', u'hdfs documentation', 247 | [u'Author'], 1) 248 | ] 249 | 250 | # If true, show URL addresses after external links. 251 | #man_show_urls = False 252 | 253 | 254 | # -- Options for Texinfo output ------------------------------------------- 255 | 256 | # Grouping the document tree into Texinfo files. List of tuples 257 | # (source start file, target name, title, author, 258 | # dir menu entry, description, category) 259 | texinfo_documents = [ 260 | ('index', 'hdfs', u'hdfs documentation', 261 | u'Author', 'hdfs', 'One line description of project.', 262 | 'Miscellaneous'), 263 | ] 264 | 265 | # Documents to append as an appendix to all manuals. 266 | #texinfo_appendices = [] 267 | 268 | # If false, no module index is generated. 269 | #texinfo_domain_indices = True 270 | 271 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 272 | #texinfo_show_urls = 'footnote' 273 | 274 | # If true, do not generate a @detailmenu in the "Top" node's menu. 275 | #texinfo_no_detailmenu = False 276 | 277 | 278 | # -- Options for Epub output ---------------------------------------------- 279 | 280 | # Bibliographic Dublin Core info. 281 | epub_title = u'hdfs' 282 | epub_author = u'Author' 283 | epub_publisher = u'Author' 284 | epub_copyright = u'2014, Matthieu Monsch' 285 | 286 | # The basename for the epub file. It defaults to the project name. 287 | #epub_basename = u'hdfs' 288 | 289 | # The HTML theme for the epub output. 
Since the default themes are not optimized 290 | # for small screen space, using the same theme for HTML and epub output is 291 | # usually not wise. This defaults to 'epub', a theme designed to save visual 292 | # space. 293 | #epub_theme = 'epub' 294 | 295 | # The language of the text. It defaults to the language option 296 | # or en if the language is not set. 297 | #epub_language = '' 298 | 299 | # The scheme of the identifier. Typical schemes are ISBN or URL. 300 | #epub_scheme = '' 301 | 302 | # The unique identifier of the text. This can be a ISBN number 303 | # or the project homepage. 304 | #epub_identifier = '' 305 | 306 | # A unique identification for the text. 307 | #epub_uid = '' 308 | 309 | # A tuple containing the cover image and cover page html template filenames. 310 | #epub_cover = () 311 | 312 | # A sequence of (type, uri, title) tuples for the guide element of content.opf. 313 | #epub_guide = () 314 | 315 | # HTML files that should be inserted before the pages created by sphinx. 316 | # The format is a list of tuples containing the path and title. 317 | #epub_pre_files = [] 318 | 319 | # HTML files that should be inserted after the pages created by sphinx. 320 | # The format is a list of tuples containing the path and title. 321 | #epub_post_files = [] 322 | 323 | # A list of files that should not be packed into the epub file. 324 | epub_exclude_files = ['search.html'] 325 | 326 | # The depth of the table of contents in toc.ncx. 327 | #epub_tocdepth = 3 328 | 329 | # Allow duplicate toc entries. 330 | #epub_tocdup = True 331 | 332 | # Choose between 'default' and 'includehidden'. 333 | #epub_tocscope = 'default' 334 | 335 | # Fix unsupported image types using the PIL. 336 | #epub_fix_images = False 337 | 338 | # Scale large images. 339 | #epub_max_image_width = 0 340 | 341 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 342 | #epub_show_urls = 'inline' 343 | 344 | # If false, no index is generated. 345 | #epub_use_index = True 346 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. default-role:: code 2 | 3 | 4 | HdfsCLI 5 | ======= 6 | 7 | API and command line interface for HDFS. 8 | 9 | + `Project homepage on GitHub`_ 10 | + `PyPI entry`_ 11 | 12 | 13 | Installation 14 | ------------ 15 | 16 | Using pip_: 17 | 18 | .. code-block:: bash 19 | 20 | $ pip install hdfs 21 | 22 | By default none of the package requirements for extensions are installed. To do 23 | so simply suffix the package name with the desired extensions: 24 | 25 | .. code-block:: bash 26 | 27 | $ pip install hdfs[avro,dataframe,kerberos] 28 | 29 | 30 | User guide 31 | ---------- 32 | 33 | .. toctree:: 34 | :maxdepth: 2 35 | 36 | quickstart 37 | advanced 38 | api 39 | 40 | 41 | Sample script 42 | ------------- 43 | 44 | .. literalinclude:: ../examples/json.py 45 | 46 | More examples can be found in the `examples/` folder on GitHub. 47 | 48 | 49 | .. _Project homepage on GitHub: https://github.com/mtth/hdfs 50 | .. _PyPI entry: https://pypi.python.org/pypi/hdfs/ 51 | .. _pip: http://www.pip-installer.org/en/latest/ 52 | -------------------------------------------------------------------------------- /doc/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. 
default-role:: code 2 | 3 | 4 | Quickstart 5 | ========== 6 | 7 | This page first goes through the steps required to configure HdfsCLI's command 8 | line interface then gives an overview of the python API. If you are only 9 | interested in using HdfsCLI as a library, then feel free to jump ahead to the 10 | `Python bindings`_ section. 11 | 12 | 13 | Configuration 14 | ------------- 15 | 16 | HdfsCLI uses *aliases* to figure out how to connect to different HDFS clusters. 17 | These are defined in HdfsCLI's configuration file, located by default at 18 | `~/.hdfscli.cfg` (or elsewhere by setting the `HDFSCLI_CONFIG` environment 19 | variable correspondingly). See below for a sample configuration defining two 20 | aliases, `dev` and `prod`: 21 | 22 | .. code-block:: cfg 23 | 24 | [global] 25 | default.alias = dev 26 | 27 | [dev.alias] 28 | url = http://dev.namenode:port 29 | user = ann 30 | 31 | [prod.alias] 32 | url = http://prod.namenode:port 33 | root = /jobs/ 34 | 35 | Each alias is defined as its own `ALIAS.alias` section which must at least 36 | contain a `url` option with the URL to the namenode (including protocol and 37 | port). All other options can be omitted. If specified, `client` determines 38 | which :class:`hdfs.client.Client` class to use and the remaining options are 39 | passed as keyword arguments to the appropriate constructor. The currently 40 | available client classes are: 41 | 42 | + :class:`~hdfs.client.InsecureClient` (the default) 43 | + :class:`~hdfs.client.TokenClient` 44 | 45 | See the :ref:`Kerberos extension ` to enable the 46 | :class:`~hdfs.ext.kerberos.KerberosClient` and :ref:`custom_client` to learn 47 | how to use other client classes. 48 | 49 | The `url` option can be configured to support High Availability namenodes when using WebHDFS, 50 | simply add more URLs by delimiting with a semicolon (`;`). 51 | 52 | Finally, note the `default.alias` entry in the global configuration section 53 | which will be used as default alias if none is specified. 54 | 55 | 56 | Command line interface 57 | ---------------------- 58 | 59 | HdfsCLI comes by default with a single entry point `hdfscli` which provides a 60 | convenient interface to perform common actions. All its commands accept an 61 | `--alias` argument (described above), which defines against which cluster to 62 | operate. 63 | 64 | 65 | Downloading and uploading files 66 | ******************************* 67 | 68 | HdfsCLI supports downloading and uploading files and folders transparently from 69 | HDFS (we can also specify the degree of parallelism by using the `--threads` 70 | option). 71 | 72 | .. code-block:: bash 73 | 74 | $ # Write a single file to HDFS. 75 | $ hdfscli upload --alias=dev weights.json models/ 76 | $ # Read all files inside a folder from HDFS and store them locally. 77 | $ hdfscli download export/results/ "results-$(date +%F)" 78 | 79 | If reading (resp. writing) a single file, its contents can also be streamed to 80 | standard out (resp. from standard in) by using `-` as path argument: 81 | 82 | .. code-block:: bash 83 | 84 | $ # Read a file from HDFS and append its contents to a local log file. 85 | $ hdfscli download logs/1987-03-23.txt - >>logs 86 | 87 | By default HdfsCLI will throw an error if trying to write to an existing path 88 | (either locally or on HDFS). We can force the path to be overwritten with the 89 | `--force` option. 90 | 91 | 92 | .. 
_interactive_shell: 93 | 94 | Interactive shell 95 | ***************** 96 | 97 | The `interactive` command (used also when no command is specified) will create 98 | an HDFS client and expose it inside a python shell (using IPython_ if 99 | available). This makes is convenient to perform file system operations on HDFS 100 | and interact with its data. See :ref:`python_bindings` below for an overview of 101 | the methods available. 102 | 103 | .. code-block:: bash 104 | 105 | $ hdfscli --alias=dev 106 | 107 | Welcome to the interactive HDFS python shell. 108 | The HDFS client is available as `CLIENT`. 109 | 110 | In [1]: CLIENT.list('data/') 111 | Out[1]: ['1.json', '2.json'] 112 | 113 | In [2]: CLIENT.status('data/2.json') 114 | Out[2]: { 115 | 'accessTime': 1439743128690, 116 | 'blockSize': 134217728, 117 | 'childrenNum': 0, 118 | 'fileId': 16389, 119 | 'group': 'supergroup', 120 | 'length': 2, 121 | 'modificationTime': 1439743129392, 122 | 'owner': 'drwho', 123 | 'pathSuffix': '', 124 | 'permission': '755', 125 | 'replication': 1, 126 | 'storagePolicy': 0, 127 | 'type': 'FILE' 128 | } 129 | 130 | In [3]: CLIENT.delete('data/2.json') 131 | Out[3]: True 132 | 133 | Using the full power of python lets us easily perform more complex operations 134 | such as renaming folder which match some pattern, deleting files which haven't 135 | been accessed for some duration, finding all paths owned by a certain user, 136 | etc. 137 | 138 | 139 | More 140 | **** 141 | 142 | Cf. `hdfscli --help` for the full list of commands and options. 143 | 144 | 145 | .. _python_bindings: 146 | 147 | Python bindings 148 | --------------- 149 | 150 | 151 | Instantiating a client 152 | ********************** 153 | 154 | The simplest way of getting a :class:`hdfs.client.Client` instance is by using 155 | the :ref:`interactive_shell` described above, where the client will be 156 | automatically available. To instantiate a client programmatically, there are 157 | two options: 158 | 159 | The first is to import the client class and call its constructor directly. This 160 | is the most straightforward and flexible, but doesn't let us reuse our 161 | configured aliases: 162 | 163 | .. code-block:: python 164 | 165 | from hdfs import InsecureClient 166 | client = InsecureClient('http://host:port', user='ann') 167 | 168 | The second leverages the :class:`hdfs.config.Config` class to load an existing 169 | configuration file (defaulting to the same one as the CLI) and create clients 170 | from existing aliases: 171 | 172 | .. code-block:: python 173 | 174 | from hdfs import Config 175 | client = Config().get_client('dev') 176 | 177 | 178 | Reading and writing files 179 | ************************* 180 | 181 | The :meth:`~hdfs.client.Client.read` method provides a file-like interface for 182 | reading files from HDFS. It must be used in a `with` block (making sure that 183 | connections are always properly closed): 184 | 185 | .. code-block:: python 186 | 187 | # Loading a file in memory. 188 | with client.read('features') as reader: 189 | features = reader.read() 190 | 191 | # Directly deserializing a JSON object. 192 | with client.read('model.json', encoding='utf-8') as reader: 193 | from json import load 194 | model = load(reader) 195 | 196 | If a `chunk_size` argument is passed, the method will return a generator 197 | instead, making it sometimes simpler to stream the file's contents. 198 | 199 | .. code-block:: python 200 | 201 | # Stream a file. 
202 | with client.read('features', chunk_size=8096) as reader: 203 | for chunk in reader: 204 | pass 205 | 206 | Similarly, if a `delimiter` argument is passed, the method will return a 207 | generator of the delimited chunks. 208 | 209 | .. code-block:: python 210 | 211 | with client.read('samples.csv', encoding='utf-8', delimiter='\n') as reader: 212 | for line in reader: 213 | pass 214 | 215 | Writing files to HDFS is done using the :meth:`~hdfs.client.Client.write` 216 | method which returns a file-like writable object: 217 | 218 | .. code-block:: python 219 | 220 | # Writing part of a file. 221 | with open('samples') as reader, client.write('samples') as writer: 222 | for line in reader: 223 | if line.startswith('-'): 224 | writer.write(line) 225 | 226 | # Writing a serialized JSON object. 227 | with client.write('model.json', encoding='utf-8') as writer: 228 | from json import dump 229 | dump(model, writer) 230 | 231 | For convenience, it is also possible to pass an iterable `data` argument 232 | directly to the method. 233 | 234 | .. code-block:: python 235 | 236 | # This is equivalent to the JSON example above. 237 | from json import dumps 238 | client.write('model.json', dumps(model)) 239 | 240 | 241 | Exploring the file system 242 | ************************* 243 | 244 | All :class:`~hdfs.client.Client` subclasses expose a variety of methods to 245 | interact with HDFS. Most are modeled directly after the WebHDFS operations, a 246 | few of these are shown in the snippet below: 247 | 248 | .. code-block:: python 249 | 250 | # Retrieving a file or folder content summary. 251 | content = client.content('dat') 252 | 253 | # Listing all files inside a directory. 254 | fnames = client.list('dat') 255 | 256 | # Retrieving a file or folder status. 257 | status = client.status('dat/features') 258 | 259 | # Renaming ("moving") a file. 260 | client.rename('dat/features', 'features') 261 | 262 | # Deleting a file or folder. 263 | client.delete('dat', recursive=True) 264 | 265 | Other methods build on these to provide more advanced features: 266 | 267 | .. code-block:: python 268 | 269 | # Download a file or folder locally. 270 | client.download('dat', 'dat', n_threads=5) 271 | 272 | # Get all files under a given folder (arbitrary depth). 273 | import posixpath as psp 274 | fpaths = [ 275 | psp.join(dpath, fname) 276 | for dpath, _, fnames in client.walk('predictions') 277 | for fname in fnames 278 | ] 279 | 280 | See the :ref:`api_reference` for the comprehensive list of methods available. 281 | 282 | 283 | Checking path existence 284 | *********************** 285 | 286 | Most of the methods described above will raise an :class:`~hdfs.util.HdfsError` 287 | if called on a missing path. The recommended way of checking whether a path 288 | exists is using the :meth:`~hdfs.client.Client.content` or 289 | :meth:`~hdfs.client.Client.status` methods with a `strict=False` argument (in 290 | which case they will return `None` on a missing path). 291 | 292 | 293 | More 294 | **** 295 | 296 | See the :ref:`advanced_usage` section to learn more. 297 | 298 | 299 | .. 
_IPython: http://ipython.org/
300 | 
--------------------------------------------------------------------------------
/doc/requirements.txt:
--------------------------------------------------------------------------------
1 | mock; python_version<'3.3'
2 | 
--------------------------------------------------------------------------------
/etc/hadoop/core-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>fs.defaultFS</name>
4 |     <value>hdfs://localhost:51000</value>
5 |   </property>
6 |   <property>
7 |     <name>fs.trash.interval</name>
8 |     <value>10</value>
9 |   </property>
10 |   <property>
11 |     <name>fs.trash.checkpoint.interval</name>
12 |     <value>1</value>
13 |   </property>
14 |   <property>
15 |     <name>hadoop.proxyuser.#USER#.hosts</name>
16 |     <value>*</value>
17 |   </property>
18 |   <property>
19 |     <name>hadoop.proxyuser.#USER#.groups</name>
20 |     <value>*</value>
21 |   </property>
22 | </configuration>
23 | 
--------------------------------------------------------------------------------
/etc/hadoop/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>dfs.replication</name>
4 |     <value>1</value>
5 |   </property>
6 |   <property>
7 |     <name>dfs.support.append</name>
8 |     <value>true</value>
9 |   </property>
10 |   <property>
11 |     <name>dfs.webhdfs.enabled</name>
12 |     <value>true</value>
13 |   </property>
14 |   <property>
15 |     <name>dfs.namenode.acls.enabled</name>
16 |     <value>true</value>
17 |   </property>
18 | </configuration>
19 | 
--------------------------------------------------------------------------------
/etc/hadoop/httpfs-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>httpfs.authentication.signature.secret.file</name>
4 |     <value>${httpfs.config.dir}/httpfs-site.xml</value>
5 |   </property>
6 | </configuration>
7 | 
--------------------------------------------------------------------------------
/etc/hadoop/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
2 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
3 | log4j.appender.console.target=System.err
4 | log4j.appender.console=org.apache.log4j.ConsoleAppender
5 | log4j.rootLogger=INFO,console
6 | 
--------------------------------------------------------------------------------
/examples/avro-example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | """Avro extension example."""
5 | 
6 | from hdfs import Config
7 | from hdfs.ext.avro import AvroReader, AvroWriter
8 | 
9 | 
10 | # Get the default alias' client.
11 | client = Config().get_client()
12 | 
13 | # Some sample data.
14 | records = [
15 |   {'name': 'Ann', 'age': 23},
16 |   {'name': 'Bob', 'age': 22},
17 | ]
18 | 
19 | # Write an Avro File to HDFS (since our records' schema is very simple, we let
20 | # the writer infer it automatically, otherwise we would pass it as argument).
21 | with AvroWriter(client, 'names.avro', overwrite=True) as writer:
22 |   for record in records:
23 |     writer.write(record)
24 | 
25 | # Read it back.
26 | with AvroReader(client, 'names.avro') as reader:
27 |   schema = reader.schema # The inferred schema.
28 |   content = reader.content # The remote file's HDFS content object.
29 |   assert list(reader) == records # The records match!
30 | 
--------------------------------------------------------------------------------
/examples/dataframe-example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | """Dataframe extension example."""
5 | 
6 | from hdfs import Config
7 | from hdfs.ext.dataframe import read_dataframe, write_dataframe
8 | import pandas as pd
9 | 
10 | 
11 | # Get the default alias' client.
12 | client = Config().get_client()
13 | 
14 | # A sample dataframe.
15 | df = pd.DataFrame.from_records([ 16 | {'A': 1, 'B': 2}, 17 | {'A': 11, 'B': 23} 18 | ]) 19 | 20 | # Write dataframe to HDFS using Avro serialization. 21 | write_dataframe(client, 'data.avro', df, overwrite=True) 22 | 23 | # Read the Avro file back from HDFS. 24 | _df = read_dataframe(client, 'data.avro') 25 | 26 | # The frames match! 27 | pd.testing.assert_frame_equal(df, _df) 28 | -------------------------------------------------------------------------------- /examples/json-example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Sample HdfsCLI script. 5 | 6 | This example shows how to write files to HDFS, read them back, and perform a 7 | few other simple filesystem operations. 8 | 9 | """ 10 | 11 | from hdfs import Config 12 | from json import dump, load 13 | 14 | 15 | # Get the default alias' client. (See the quickstart section in the 16 | # documentation to learn more about this.) 17 | client = Config().get_client() 18 | 19 | # Some fake data that we are interested in uploading to HDFS. 20 | model = { 21 | '(intercept)': 48., 22 | 'first_feature': 2., 23 | 'second_feature': 12., 24 | } 25 | 26 | # First, we delete any existing `models/` folder on HDFS. 27 | client.delete('models', recursive=True) 28 | 29 | # We can now upload the data, first as CSV. 30 | with client.write('models/1.csv', encoding='utf-8') as writer: 31 | for item in model.items(): 32 | writer.write(u'%s,%s\n' % item) 33 | 34 | # We can also serialize it to JSON and directly upload it. 35 | with client.write('models/1.json', encoding='utf-8') as writer: 36 | dump(model, writer) 37 | 38 | # We can check that the files exist and get their properties. 39 | assert client.list('models') == ['1.csv', '1.json'] 40 | status = client.status('models/1.csv') 41 | content = client.content('models/1.json') 42 | 43 | # Later, we can download the files back. The `delimiter` option makes it 44 | # convenient to read CSV files. 45 | with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader: 46 | items = (line.split(',') for line in reader if line) 47 | assert {name: float(value) for name, value in items} == model 48 | 49 | # Loading JSON directly from HDFS is even simpler. 50 | with client.read('models/1.json', encoding='utf-8') as reader: 51 | assert load(reader) == model 52 | -------------------------------------------------------------------------------- /hdfs/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """HdfsCLI: API and command line interface for HDFS.""" 5 | 6 | from .client import Client, InsecureClient, TokenClient 7 | from .config import Config, NullHandler 8 | from .util import HdfsError 9 | import logging as lg 10 | 11 | 12 | __version__ = '2.7.3' 13 | __license__ = 'MIT' 14 | 15 | 16 | lg.getLogger(__name__).addHandler(NullHandler()) 17 | -------------------------------------------------------------------------------- /hdfs/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """HdfsCLI: a command line interface for HDFS. 5 | 6 | Usage: 7 | hdfscli [interactive] [-a ALIAS] [-v...] 8 | hdfscli download [-fsa ALIAS] [-v...] [-t THREADS] HDFS_PATH LOCAL_PATH 9 | hdfscli upload [-sa ALIAS] [-v...] 
[-A | -f] [-t THREADS] LOCAL_PATH HDFS_PATH 10 | hdfscli -L | -V | -h 11 | 12 | Commands: 13 | download Download a file or folder from HDFS. If a 14 | single file is downloaded, - can be 15 | specified as LOCAL_PATH to stream it to 16 | standard out. 17 | interactive Start the client and expose it via the python 18 | interpreter (using iPython if available). 19 | upload Upload a file or folder to HDFS. - can be 20 | specified as LOCAL_PATH to read from standard 21 | in. 22 | 23 | Arguments: 24 | HDFS_PATH Remote HDFS path. 25 | LOCAL_PATH Path to local file or directory. 26 | 27 | Options: 28 | -A --append Append data to an existing file. Only supported 29 | if uploading a single file or from standard in. 30 | -L --log Show path to current log file and exit. 31 | -V --version Show version and exit. 32 | -a ALIAS --alias=ALIAS Alias of namenode to connect to. 33 | -f --force Allow overwriting any existing files. 34 | -s --silent Don't display progress status. 35 | -t THREADS --threads=THREADS Number of threads to use for parallelization. 36 | 0 allocates a thread per file. [default: 0] 37 | -v --verbose Enable log output. Can be specified up to three 38 | times (increasing verbosity each time). 39 | 40 | Examples: 41 | hdfscli -a prod /user/foo 42 | hdfscli download features.avro dat/ 43 | hdfscli download logs/1987-03-23 - >>logs 44 | hdfscli upload -f - data/weights.tsv 0: 144 | self._writer.write( 145 | '%3.1f%%\t[ pending: %d | downloading: %d | complete: %d ] \r' % 146 | ( 147 | 100. * sum(data.values()) / self._total_bytes, 148 | self._pending_files, 149 | self._downloading_files, 150 | self._complete_files, 151 | ) 152 | ) 153 | else: 154 | self._writer.write('%79s\r' % ('', )) 155 | 156 | @classmethod 157 | def from_hdfs_path(cls, client, hdfs_path, writer=None): 158 | """Instantiate from remote path. 159 | 160 | :param client: HDFS client. 161 | :param hdfs_path: HDFS path. 162 | 163 | """ 164 | content = client.content(hdfs_path) 165 | return cls(content['length'], content['fileCount'], writer=writer) 166 | 167 | @classmethod 168 | def from_local_path(cls, local_path, writer=None): 169 | """Instantiate from a local path. 170 | 171 | :param local_path: Local path. 172 | 173 | """ 174 | if osp.isdir(local_path): 175 | nbytes = 0 176 | nfiles = 0 177 | for dpath, _, fnames in os.walk(local_path): 178 | for fname in fnames: 179 | nbytes += osp.getsize(osp.join(dpath, fname)) 180 | nfiles += 1 181 | elif osp.exists(local_path): 182 | nbytes = osp.getsize(local_path) 183 | nfiles = 1 184 | else: 185 | raise HdfsError('No file found at: %s', local_path) 186 | return cls(nbytes, nfiles, writer=writer) 187 | 188 | @catch(HdfsError) 189 | def main(argv=None, client=None): 190 | """Entry point. 191 | 192 | :param argv: Arguments list. 193 | :param client: For testing. 
194 | 195 | """ 196 | args = docopt(__doc__, argv=argv, version=__version__) 197 | if not client: 198 | client = configure_client('hdfscli', args) 199 | elif args['--log']: 200 | raise HdfsError('Logging is only available when no client is specified.') 201 | hdfs_path = args['HDFS_PATH'] 202 | local_path = args['LOCAL_PATH'] 203 | n_threads = parse_arg(args, '--threads', int) 204 | force = args['--force'] 205 | silent = args['--silent'] 206 | if args['download']: 207 | chunk_size = 2 ** 16 208 | if local_path == '-': 209 | if not sys.stdout.isatty() and sys.stderr.isatty() and not silent: 210 | progress = _Progress.from_hdfs_path(client, hdfs_path) 211 | else: 212 | progress = None 213 | with client.read( 214 | hdfs_path, 215 | chunk_size=chunk_size, 216 | progress=progress, 217 | ) as reader: 218 | # https://stackoverflow.com/a/23932488/1062617 219 | stdout = getattr(sys.stdout, 'buffer', sys.stdout) 220 | for chunk in reader: 221 | stdout.write(chunk) 222 | else: 223 | if sys.stderr.isatty() and not silent: 224 | progress = _Progress.from_hdfs_path(client, hdfs_path) 225 | else: 226 | progress = None 227 | client.download( 228 | hdfs_path, 229 | local_path, 230 | overwrite=force, 231 | n_threads=n_threads, 232 | chunk_size=chunk_size, 233 | progress=progress, 234 | ) 235 | elif args['upload']: 236 | append = args['--append'] 237 | if local_path == '-': 238 | client.write( 239 | hdfs_path, 240 | (line for line in sys.stdin), # Doesn't work with stdin. 241 | append=append, 242 | overwrite=force, 243 | ) 244 | else: 245 | if append: 246 | # TODO: Add progress tracking here. 247 | if osp.isfile(local_path): 248 | with open(local_path) as reader: 249 | client.write(hdfs_path, reader, append=True) 250 | else: 251 | raise HdfsError('Can only append when uploading a single file.') 252 | else: 253 | if sys.stderr.isatty() and not silent: 254 | progress = _Progress.from_local_path(local_path) 255 | else: 256 | progress = None 257 | client.upload( 258 | hdfs_path, 259 | local_path, 260 | overwrite=force, 261 | n_threads=n_threads, 262 | progress=progress, 263 | ) 264 | else: 265 | banner = ( 266 | '\n' 267 | 'Welcome to the interactive HDFS python shell.\n' 268 | 'The HDFS client is available as `CLIENT`.\n' 269 | ) 270 | namespace = {'CLIENT': client} 271 | try: 272 | from IPython import embed 273 | except ImportError: 274 | from code import interact 275 | interact(banner=banner, local=namespace) 276 | else: 277 | embed(banner1=banner, user_ns=namespace) 278 | 279 | if __name__ == '__main__': 280 | main() 281 | -------------------------------------------------------------------------------- /hdfs/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Command line interface configuration module. 5 | 6 | This module provides programmatic access to HdfsCLI's configuration settings. 7 | In particular it exposes the ability to instantiate clients from aliases (see 8 | :meth:`Config.get_client`). 
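For illustration only, a minimal round trip could look as follows; the alias
name, URL, and configuration contents shown here are placeholders rather than
values shipped with this project:

.. code-block:: python

    # Assumed contents of `~/.hdfscli.cfg` (or the file pointed to by the
    # `HDFSCLI_CONFIG` environment variable):
    #
    #   [global]
    #   default.alias = dev
    #
    #   [dev.alias]
    #   url = http://localhost:9870
    #
    from hdfs import Config

    config = Config()
    client = config.get_client()         # Resolves the `default.alias` entry.
    explicit = config.get_client('dev')  # Or look an alias up by name.
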
9 | 10 | """ 11 | 12 | from .client import Client 13 | from .util import HdfsError 14 | from functools import wraps 15 | from logging.handlers import TimedRotatingFileHandler 16 | from six.moves.configparser import ParsingError, RawConfigParser 17 | from tempfile import gettempdir 18 | import importlib.util 19 | import importlib.machinery 20 | import logging as lg 21 | import os 22 | import os.path as osp 23 | import sys 24 | 25 | _logger = lg.getLogger(__name__) 26 | 27 | 28 | def _load_source(modname, filename): 29 | """Imitate the old imp.load_source() function, removed in Python 3.12""" 30 | # Based on sample code in https://docs.python.org/3.12/whatsnew/3.12.html. 31 | loader = importlib.machinery.SourceFileLoader(modname, filename) 32 | spec = importlib.util.spec_from_file_location(modname, filename, loader=loader) 33 | module = importlib.util.module_from_spec(spec) 34 | sys.modules[module.__name__] = module 35 | loader.exec_module(module) 36 | return module 37 | 38 | 39 | class NullHandler(lg.Handler): 40 | 41 | """Pass-through logging handler. 42 | 43 | This is required for python <2.7. 44 | 45 | """ 46 | 47 | def emit(self, record): 48 | """Do nothing.""" 49 | pass 50 | 51 | 52 | class Config(RawConfigParser): 53 | 54 | """Configuration class. 55 | 56 | :param path: path to configuration file. If no file exists at that location, 57 | the configuration parser will be empty. If not specified, the value of the 58 | `HDFSCLI_CONFIG` environment variable is used if it exists, otherwise it 59 | defaults to `~/.hdfscli.cfg`. 60 | :param stream_log_level: Stream handler log level, attached to the root 61 | logger. A false-ish value will disable this handler. This is particularly 62 | useful with the :func:`catch` function which reports exceptions as log 63 | messages. 64 | 65 | On instantiation, the configuration object will attempt to load modules 66 | defined in the `autoload` global options (see :ref:`custom_client` for more 67 | information). 68 | 69 | """ 70 | 71 | default_path = osp.expanduser('~/.hdfscli.cfg') 72 | global_section = 'global' 73 | 74 | def __init__(self, path=None, stream_log_level=None): 75 | RawConfigParser.__init__(self) 76 | self._clients = {} 77 | self.path = path or os.getenv('HDFSCLI_CONFIG', self.default_path) 78 | if stream_log_level: 79 | stream_handler = lg.StreamHandler() 80 | stream_handler.setLevel(stream_log_level) 81 | fmt = '%(levelname)s\t%(message)s' 82 | stream_handler.setFormatter(lg.Formatter(fmt)) 83 | lg.getLogger().addHandler(stream_handler) 84 | if osp.exists(self.path): 85 | try: 86 | self.read(self.path) 87 | except ParsingError: 88 | raise HdfsError('Invalid configuration file %r.', self.path) 89 | else: 90 | self._autoload() 91 | _logger.info('Instantiated configuration from %r.', self.path) 92 | else: 93 | _logger.info('Instantiated empty configuration.') 94 | 95 | def __repr__(self): 96 | return ''.format(self.path) 97 | 98 | def get_client(self, alias=None): 99 | """Load HDFS client. 100 | 101 | :param alias: The client to look up. If not specified, the default alias be 102 | used (`default.alias` option in the `global` section) if available and an 103 | error will be raised otherwise. 104 | 105 | Further calls to this method for the same alias will return the same client 106 | instance (in particular, any option changes to this alias will not be taken 107 | into account). 
108 | 109 | """ 110 | if not alias: 111 | if ( 112 | not self.has_section(self.global_section) or 113 | not self.has_option(self.global_section, 'default.alias') 114 | ): 115 | raise HdfsError('No alias specified and no default alias found.') 116 | alias = self.get(self.global_section, 'default.alias') 117 | if not alias in self._clients: 118 | for suffix in ('.alias', '_alias'): 119 | section = '{}{}'.format(alias, suffix) 120 | if self.has_section(section): 121 | options = dict(self.items(section)) 122 | class_name = options.pop('client', 'InsecureClient') 123 | # Massage options. 124 | if 'timeout' in options: 125 | timeout = tuple(int(s) for s in options['timeout'].split(',')) 126 | options['timeout'] = timeout[0] if len(timeout) == 1 else timeout 127 | self._clients[alias] = Client.from_options(options, class_name) 128 | break 129 | else: 130 | raise HdfsError('Alias %r not found in %r.', alias, self.path) 131 | return self._clients[alias] 132 | 133 | def get_log_handler(self, command): 134 | """Configure and return log handler. 135 | 136 | :param command: The command to load the configuration for. All options will 137 | be looked up in the `[COMMAND.command]` section. This is currently only 138 | used for configuring the file handler for logging. If logging is disabled 139 | for the command, a :class:`NullHandler` will be returned, else a 140 | :class:`TimedRotatingFileHandler`. 141 | 142 | """ 143 | section = '{}.command'.format(command) 144 | path = osp.join(gettempdir(), '{}.log'.format(command)) 145 | level = lg.DEBUG 146 | if self.has_section(section): 147 | key = 'log.disable' 148 | if self.has_option(section, key) and self.getboolean(section, key): 149 | return NullHandler() 150 | if self.has_option(section, 'log.path'): 151 | path = self.get(section, 'log.path') # Override default path. 152 | if self.has_option(section, 'log.level'): 153 | level = getattr(lg, self.get(section, 'log.level').upper()) 154 | file_handler = TimedRotatingFileHandler( 155 | path, 156 | when='midnight', # Daily backups. 157 | backupCount=1, 158 | encoding='utf-8', 159 | ) 160 | fmt = '%(asctime)s\t%(name)-16s\t%(levelname)-5s\t%(message)s' 161 | file_handler.setFormatter(lg.Formatter(fmt)) 162 | file_handler.setLevel(level) 163 | return file_handler 164 | 165 | def _autoload(self): 166 | """Load modules to find clients.""" 167 | 168 | def _load(suffix, loader): 169 | """Generic module loader.""" 170 | option = 'autoload.{}'.format(suffix) 171 | if self.has_option(self.global_section, option): 172 | entries = self.get(self.global_section, option) 173 | for entry in entries.split(','): 174 | module = entry.strip() 175 | try: 176 | loader(module) 177 | except Exception: # pylint: disable=broad-except 178 | _logger.exception( 179 | 'Unable to load %r defined at %r.', 180 | module, self.path 181 | ) 182 | sys.exit(1) 183 | 184 | 185 | _load('modules', __import__) 186 | _load('paths', lambda path: _load_source( 187 | osp.splitext(osp.basename(path))[0], 188 | path 189 | )) 190 | 191 | 192 | def catch(*error_classes): 193 | r"""Returns a decorator that catches errors and prints messages to stderr. 194 | 195 | :param \*error_classes: Error classes. 196 | 197 | Also exits with status 1 if any errors are caught. 198 | 199 | """ 200 | def decorator(func): 201 | """Decorator.""" 202 | @wraps(func) 203 | def wrapper(*args, **kwargs): 204 | """Wrapper. 
Finally.""" 205 | try: 206 | return func(*args, **kwargs) 207 | except error_classes as err: 208 | _logger.error(err) 209 | sys.exit(1) 210 | except Exception: # pylint: disable=broad-except 211 | _logger.exception('Unexpected exception.') 212 | sys.exit(1) 213 | return wrapper 214 | return decorator 215 | -------------------------------------------------------------------------------- /hdfs/ext/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Extensions.""" 5 | -------------------------------------------------------------------------------- /hdfs/ext/avro/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # pylint: disable=protected-access 4 | 5 | """Read and write Avro_ files directly from HDFS. 6 | 7 | This extension enables streaming decoding and encoding of files from and to 8 | HDFS. It requires the `fastavro` library. 9 | 10 | + :class:`AvroWriter` writes Avro files on HDFS from python objects. 11 | + :class:`AvroReader` reads Avro files from HDFS into an iterable of records. 12 | 13 | Sample usage: 14 | 15 | .. literalinclude:: ../examples/avro.py 16 | 17 | It also features an entry point (named `hdfscli-avro` by default) which 18 | provides access to the above functionality from the shell. For usage examples 19 | and more information: 20 | 21 | .. code-block:: bash 22 | 23 | $ hdfscli-avro --help 24 | 25 | .. _Avro: https://avro.apache.org/docs/1.7.7/index.html 26 | 27 | """ 28 | 29 | from ...util import AsyncWriter, HdfsError 30 | from json import dumps 31 | from six import integer_types, string_types 32 | import fastavro 33 | import io 34 | import logging as lg 35 | import os 36 | import posixpath as psp 37 | import sys 38 | 39 | 40 | _logger = lg.getLogger(__name__) 41 | 42 | 43 | # The number of bytes in a sync marker (http://mtth.xyz/_9lc9t3hjtx69x54). 44 | SYNC_SIZE = 16 45 | 46 | class _SchemaInferrer(object): 47 | 48 | """Utility to infer Avro schemas from python values.""" 49 | 50 | def __init__(self): 51 | self.record_index = 0 52 | 53 | def infer(self, obj): 54 | """Infer Avro type corresponding to a python object. 55 | 56 | :param obj: Python primitive. 57 | 58 | There are multiple limitations with this functions, among which: 59 | 60 | + Nullable fields aren't supported. 61 | + Only Avro integers will be inferred, so some values may overflow. 62 | + Record names are auto-generated. 63 | 64 | """ 65 | if isinstance(obj, bool): 66 | return 'boolean' 67 | elif isinstance(obj, string_types): 68 | return 'string' 69 | elif isinstance(obj, integer_types): # Python 3 doesn't have `long`. 70 | return 'int' 71 | elif isinstance(obj, float): 72 | return 'float' 73 | elif isinstance(obj, list): 74 | if not obj: 75 | raise ValueError('Cannot infer type of empty array.') 76 | return { 77 | 'type': 'array', 78 | 'items': self.infer(obj[0]) 79 | } 80 | elif isinstance(obj, dict): 81 | if not obj: 82 | raise ValueError('Cannot infer type of empty record.') 83 | self.record_index += 1 84 | return { 85 | 'name': '__Record{}'.format(self.record_index), 86 | 'type': 'record', 87 | 'fields': [ 88 | {'name': k, 'type': self.infer(v)} 89 | for k, v in sorted(obj.items()) # Sort fields by name. 90 | ] 91 | } 92 | raise ValueError('Cannot infer type from {}: {!r}'.format(type(obj), obj)) 93 | 94 | 95 | class _SeekableReader(object): 96 | 97 | """Customized reader for Avro. 
98 | 99 | :param reader: Non-seekable reader. 100 | :param size: For testing. 101 | 102 | It detects reads of sync markers' sizes and will buffer these. Note that this 103 | reader is heavily particularized to how the `fastavro` library performs Avro 104 | decoding. 105 | 106 | """ 107 | 108 | def __init__(self, reader, size=None): 109 | self._reader = reader 110 | self._size = size or SYNC_SIZE 111 | self._buffer = None 112 | self._saught = False 113 | 114 | def read(self, nbytes): 115 | """Read bytes, caching the read if the size matches.""" 116 | buf = self._buffer 117 | if self._saught: 118 | assert buf 119 | missing_bytes = nbytes - len(buf) 120 | if missing_bytes < 0: 121 | chunk = buf[:nbytes] 122 | self._buffer = buf[nbytes:] 123 | else: 124 | chunk = buf 125 | if missing_bytes: 126 | chunk += self._reader.read(missing_bytes) 127 | self._buffer = None 128 | self._saught = False 129 | else: 130 | self._buffer = None 131 | chunk = self._reader.read(nbytes) 132 | if nbytes == self._size: 133 | self._buffer = chunk 134 | return chunk 135 | 136 | def seek(self, offset, whence): 137 | """Go back using the cached bytes.""" 138 | assert offset == - self._size 139 | assert whence == os.SEEK_CUR 140 | assert self._buffer 141 | self._saught = True 142 | 143 | 144 | class AvroReader(object): 145 | 146 | """HDFS Avro file reader. 147 | 148 | :param client: :class:`hdfs.client.Client` instance. 149 | :param hdfs_path: Remote path. 150 | :param parts: Part-files to read, when reading a distributed file. The 151 | default is to read all part-files in order. See 152 | :meth:`hdfs.client.Client.parts` for details. 153 | :param reader_schema: Schema to read the data as. If specified, it must be 154 | compatible with the writer's schema (the default). 155 | 156 | The contents of the file will be decoded in a streaming manner, as the data 157 | is transferred. This makes it possible to use on files of arbitrary size. As 158 | a convenience, the content summary object of the remote file is available on 159 | the reader's `content` attribute. 160 | 161 | Usage: 162 | 163 | .. code-block:: python 164 | 165 | with AvroReader(client, 'foo.avro') as reader: 166 | schema = reader.writer_schema # The remote file's Avro schema. 167 | content = reader.content # Content metadata (e.g. size). 168 | for record in reader: 169 | pass # and its records 170 | 171 | """ 172 | 173 | def __init__(self, client, hdfs_path, parts=None, reader_schema=None): 174 | self.content = client.content(hdfs_path) #: Content summary of Avro file. 175 | self.metadata = None #: Avro header metadata. 176 | self.reader_schema = reader_schema #: Input reader schema. 177 | self._writer_schema = None 178 | if self.content['directoryCount']: 179 | # This is a folder. 180 | self._paths = [ 181 | psp.join(hdfs_path, fname) 182 | for fname in client.parts(hdfs_path, parts) 183 | ] 184 | else: 185 | # This is a single file. 
186 | self._paths = [hdfs_path] 187 | self._client = client 188 | self._records = None 189 | _logger.debug('Instantiated %r.', self) 190 | 191 | def __repr__(self): 192 | return ''.format(self._paths) 193 | 194 | def __enter__(self): 195 | 196 | def _reader(): 197 | """Record generator over all part-files.""" 198 | for path in self._paths: 199 | with self._client.read(path) as bytes_reader: 200 | reader = fastavro.reader( 201 | _SeekableReader(bytes_reader), 202 | reader_schema=self.reader_schema 203 | ) 204 | if not self._writer_schema: 205 | schema = reader.writer_schema 206 | _logger.debug('Read schema from %r.', path) 207 | yield (schema, reader.metadata) 208 | for record in reader: 209 | yield record 210 | 211 | self._records = _reader() 212 | self._writer_schema, self.metadata = next(self._records) 213 | return self 214 | 215 | def __exit__(self, exc_type, exc_value, traceback): 216 | self._records.close() 217 | _logger.debug('Closed records iterator for %r.', self) 218 | 219 | def __iter__(self): # pylint: disable=non-iterator-returned 220 | if not self._records: 221 | raise HdfsError('Iteration is only supported inside a `with` block.') 222 | return self._records 223 | 224 | @property 225 | def writer_schema(self): 226 | """Get the underlying file's schema. 227 | 228 | The schema will only be available after entering the reader's corresponding 229 | `with` block. 230 | 231 | """ 232 | if not self._writer_schema: 233 | raise HdfsError('Schema not yet inferred.') 234 | return self._writer_schema 235 | 236 | # Legacy property, preserved for backwards-compatibility. 237 | schema = writer_schema 238 | 239 | 240 | class AvroWriter(object): 241 | 242 | r"""Write an Avro file on HDFS from python dictionaries. 243 | 244 | :param client: :class:`hdfs.client.Client` instance. 245 | :param hdfs_path: Remote path. 246 | :param schema: Avro schema. If not specified, the writer will try to infer it 247 | from the first record sent. There are however limitations regarding what 248 | can be inferred. 249 | :param codec: Compression codec. The default is `'null'` (no compression). 250 | :param sync_interval: Number of bytes after which a block will be written. 251 | :param sync_marker: 16 byte tag used for synchronization. If not specified, 252 | one will be generated at random. 253 | :param metadata: Additional metadata to include in the container file's 254 | header. Keys starting with `'avro.'` are reserved. 255 | :param \*\*kwargs: Keyword arguments forwarded to 256 | :meth:`hdfs.client.Client.write`. 257 | 258 | Usage: 259 | 260 | .. 
code-block:: python 261 | 262 | with AvroWriter(client, 'data.avro') as writer: 263 | for record in records: 264 | writer.write(record) 265 | 266 | """ 267 | 268 | def __init__(self, client, hdfs_path, schema=None, codec=None, 269 | sync_interval=None, sync_marker=None, metadata=None, **kwargs): 270 | self._hdfs_path = hdfs_path 271 | self._fo = client.write(hdfs_path, **kwargs) 272 | self._schema = schema 273 | self._writer_kwargs = { 274 | 'codec': codec or 'null', 275 | 'metadata': metadata, 276 | 'sync_interval': sync_interval or 1000 * SYNC_SIZE, 277 | 'sync_marker': sync_marker or os.urandom(SYNC_SIZE), 278 | } 279 | self._entered = False 280 | self._writer = None 281 | _logger.info('Instantiated %r.', self) 282 | 283 | def __repr__(self): 284 | return ''.format(self._hdfs_path) 285 | 286 | def __enter__(self): 287 | if self._entered: 288 | raise HdfsError('Avro writer cannot be reused.') 289 | self._entered = True 290 | if self._schema: 291 | self._start_writer() 292 | return self 293 | 294 | def __exit__(self, *exc_info): 295 | if not self._writer: 296 | return # No header or records were written. 297 | try: 298 | self._writer.__exit__(*exc_info) 299 | _logger.debug('Closed underlying writer.') 300 | finally: 301 | self._fo.__exit__(*exc_info) 302 | 303 | @property 304 | def schema(self): 305 | """Avro schema.""" 306 | if not self._schema: 307 | raise HdfsError('Schema not yet inferred.') 308 | return self._schema 309 | 310 | def write(self, record): 311 | """Store a record. 312 | 313 | :param record: Record object to store. 314 | 315 | Only available inside the `with` block. 316 | 317 | """ 318 | if not self._entered: 319 | raise HdfsError('Avro writer not available outside context block.') 320 | if not self._schema: 321 | self._schema = _SchemaInferrer().infer(record) 322 | _logger.info('Inferred schema: %s', dumps(self._schema)) 323 | self._start_writer() 324 | self._writer.write(record) 325 | 326 | def _start_writer(self): 327 | _logger.debug('Starting underlying writer.') 328 | 329 | def write(records): 330 | fastavro.writer( 331 | fo=self._fo.__enter__(), 332 | schema=self._schema, 333 | records=records, 334 | **self._writer_kwargs 335 | ) 336 | 337 | self._writer = AsyncWriter(write).__enter__() 338 | -------------------------------------------------------------------------------- /hdfs/ext/avro/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """HdfsCLI Avro: an Avro extension for HdfsCLI. 5 | 6 | Usage: 7 | hdfscli-avro schema [-a ALIAS] [-v...] HDFS_PATH 8 | hdfscli-avro read [-a ALIAS] [-v...] [-F FREQ | -n NUM] [-p PARTS] HDFS_PATH 9 | hdfscli-avro write [-fa ALIAS] [-v...] [-C CODEC] [-S SCHEMA] HDFS_PATH 10 | hdfscli-avro -L | -h 11 | 12 | Commands: 13 | schema Pretty print schema. 14 | read Read an Avro file from HDFS and output records 15 | as JSON to standard out. 16 | write Read JSON records from standard in and 17 | serialize them into a single Avro file on HDFS. 18 | 19 | Arguments: 20 | HDFS_PATH Remote path to Avro file or directory 21 | containing Avro part-files. 22 | 23 | Options: 24 | -C CODEC --codec=CODEC Compression codec. Available values are among: 25 | null, deflate, snappy. [default: deflate] 26 | -F FREQ --freq=FREQ Probability of sampling a record. 27 | -L --log Show path to current log file and exit. 28 | -S SCHEMA --schema=SCHEMA Schema for serializing records. If not passed, 29 | it will be inferred from the first record. 
30 | -a ALIAS --alias=ALIAS Alias of namenode to connect to. 31 | -f --force Overwrite any existing file. 32 | -h --help Show this message and exit. 33 | -n NUM --num=NUM Cap number of records to output. 34 | -p PARTS --parts=PARTS Part-files to read. Specify a number to 35 | randomly select that many, or a comma-separated 36 | list of numbers to read only these. Use a 37 | number followed by a comma (e.g. `1,`) to get a 38 | unique part-file. The default is to read all 39 | part-files. 40 | -v --verbose Enable log output. Can be specified up to three 41 | times (increasing verbosity each time). 42 | 43 | Examples: 44 | hdfscli-avro schema /data/impressions.avro 45 | hdfscli-avro read -a dev snapshot.avro >snapshot.jsonl 46 | hdfscli-avro read -F 0.1 -p 2,3 clicks.avro 47 | hdfscli-avro write -f positives.avro 0: 75 | sleep(delay) # Avoid replay errors. 76 | self._timestamp = time() 77 | return super(_HdfsHTTPKerberosAuth, self).__call__(req) 78 | 79 | 80 | class KerberosClient(Client): 81 | 82 | r"""HDFS web client using Kerberos authentication. 83 | 84 | :param url: Hostname or IP address of HDFS namenode, prefixed with protocol, 85 | followed by WebHDFS port on namenode. 86 | :param mutual_auth: Whether to enforce mutual authentication or not (possible 87 | values: `'REQUIRED'`, `'OPTIONAL'`, `'DISABLED'`). 88 | :param max_concurrency: Maximum number of allowed concurrent authentication 89 | requests. This is required since requests exceeding the threshold allowed 90 | by the server will be unable to authenticate. 91 | :param proxy: User to proxy as. 92 | :param root: Root path, this will be prefixed to all HDFS paths passed to the 93 | client. If the root is relative, the path will be assumed relative to the 94 | user's home directory. 95 | :param timeout: Connection timeouts, forwarded to the request handler. How 96 | long to wait for the server to send data before giving up, as a float, or a 97 | `(connect_timeout, read_timeout)` tuple. If the timeout is reached, an 98 | appropriate exception will be raised. See the requests_ documentation for 99 | details. 100 | :param session: `requests.Session` instance, used to emit all requests. 101 | :param \*\*kwargs: Additional arguments passed to the underlying 102 | :class:`~requests_kerberos.HTTPKerberosAuth` class. 103 | 104 | To avoid replay errors, a timeout of 1 ms is enforced between requests. If a 105 | session argument is passed in, it will be modified in-place to support 106 | authentication. 107 | 108 | """ 109 | 110 | def __init__(self, url, mutual_auth='OPTIONAL', max_concurrency=1, root=None, 111 | proxy=None, timeout=None, session=None, **kwargs): 112 | # We allow passing in a string as mutual authentication value. 
113 | if isinstance(mutual_auth, string_types): 114 | try: 115 | mutual_auth = getattr(requests_kerberos, mutual_auth) 116 | except AttributeError: 117 | raise HdfsError('Invalid mutual authentication type: %r', mutual_auth) 118 | kwargs['mutual_authentication'] = mutual_auth 119 | if not session: 120 | session = rq.Session() 121 | session.auth = _HdfsHTTPKerberosAuth(int(max_concurrency), **kwargs) 122 | super(KerberosClient, self).__init__( 123 | url, root=root, proxy=proxy, timeout=timeout, session=session 124 | ) 125 | -------------------------------------------------------------------------------- /hdfs/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Common utilities.""" 5 | 6 | from contextlib import contextmanager 7 | from shutil import rmtree 8 | from six.moves.queue import Queue 9 | from tempfile import mkstemp 10 | from threading import Thread 11 | import logging as lg 12 | import os 13 | import os.path as osp 14 | 15 | 16 | _logger = lg.getLogger(__name__) 17 | 18 | 19 | class HdfsError(Exception): 20 | 21 | """Base error class. 22 | 23 | :param message: Error message. 24 | :param args: optional Message formatting arguments. 25 | 26 | """ 27 | 28 | def __init__(self, message, *args, **kwargs): 29 | self.message = message % args if args else message 30 | super(HdfsError, self).__init__(self.message) 31 | self.exception = kwargs.get("exception") 32 | 33 | 34 | class AsyncWriter(object): 35 | 36 | """Asynchronous publisher-consumer. 37 | 38 | :param consumer: Function which takes a single generator as argument. 39 | 40 | This class can be used to transform functions which expect a generator into 41 | file-like writer objects. This can make it possible to combine different APIs 42 | together more easily. For example, to send streaming requests: 43 | 44 | .. code-block:: python 45 | 46 | import requests as rq 47 | 48 | with AsyncWriter(lambda data: rq.post(URL, data=data)) as writer: 49 | writer.write('Hello, world!') 50 | 51 | """ 52 | 53 | # Expected by pandas to write csv files (https://github.com/mtth/hdfs/pull/130). 
54 | __iter__ = None 55 | 56 | def __init__(self, consumer): 57 | self._consumer = consumer 58 | self._queue = None 59 | self._reader = None 60 | self._err = None 61 | _logger.debug('Instantiated %r.', self) 62 | 63 | def __repr__(self): 64 | return '<{}(consumer={!r})>'.format(self.__class__.__name__, self._consumer) 65 | 66 | def __enter__(self): 67 | if self._queue: 68 | raise ValueError('Cannot nest contexts.') 69 | self._queue = Queue() 70 | self._err = None 71 | 72 | def consumer(data): 73 | """Wrapped consumer that lets us get a child's exception.""" 74 | try: 75 | _logger.debug('Starting consumer.') 76 | self._consumer(data) 77 | except Exception as err: # pylint: disable=broad-except 78 | _logger.exception('Exception in child.') 79 | self._err = err 80 | finally: 81 | _logger.debug('Finished consumer.') 82 | 83 | def reader(queue): 84 | """Generator read by the consumer.""" 85 | while True: 86 | chunk = queue.get() 87 | if chunk is None: 88 | break 89 | yield chunk 90 | 91 | self._reader = Thread(target=consumer, args=(reader(self._queue), )) 92 | self._reader.start() 93 | _logger.debug('Started child thread.') 94 | return self 95 | 96 | def __exit__(self, exc_type, exc_value, traceback): 97 | if exc_value: 98 | _logger.debug('Exception in parent.') 99 | if self._reader and self._reader.is_alive(): 100 | _logger.debug('Signaling child.') 101 | self._queue.put(None) 102 | self._reader.join() 103 | if self._err: 104 | raise self._err # pylint: disable=raising-bad-type 105 | else: 106 | _logger.debug('Child terminated without errors.') 107 | self._queue = None 108 | 109 | def flush(self): 110 | """Pass-through implementation.""" 111 | pass 112 | 113 | def seekable(self): 114 | """Implement file-like method expected by certain libraries. 115 | 116 | `fastavro` relies on it in python 3. 117 | 118 | """ 119 | return False 120 | 121 | def tell(self): 122 | """No-op implementation.""" 123 | return 0 124 | 125 | def write(self, chunk): 126 | """Stream data to the underlying consumer. 127 | 128 | :param chunk: Bytes to write. These will be buffered in memory until the 129 | consumer reads them. 130 | 131 | """ 132 | if chunk: 133 | # We skip empty chunks, otherwise they cause request to terminate the 134 | # response stream. Note that these chunks can be produced by valid 135 | # upstream encoders (e.g. bzip2). 136 | self._queue.put(chunk) 137 | 138 | 139 | @contextmanager 140 | def temppath(dpath=None): 141 | """Create a temporary path. 142 | 143 | :param dpath: Explicit directory name where to create the temporary path. A 144 | system dependent default will be used otherwise (cf. `tempfile.mkstemp`). 145 | 146 | Usage:: 147 | 148 | with temppath() as path: 149 | pass # do stuff 150 | 151 | Any file or directory corresponding to the path will be automatically deleted 152 | afterwards. 
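  Note that the temporary file created by `mkstemp` is removed before the path
  is yielded, so the path does not exist when the block is entered; creating a
  file or directory there is left to the caller. For instance (an illustrative
  sketch only)::

    with temppath() as path:
      with open(path, 'w') as writer:
        writer.write('hello')
      # Whatever was created at `path` is deleted when the block exits.
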
153 | 154 | """ 155 | (desc, path) = mkstemp(dir=dpath) 156 | os.close(desc) 157 | os.remove(path) 158 | try: 159 | _logger.debug('Created temporary path at %s.', path) 160 | yield path 161 | finally: 162 | if osp.exists(path): 163 | if osp.isdir(path): 164 | rmtree(path) 165 | _logger.debug('Deleted temporary directory at %s.', path) 166 | else: 167 | os.remove(path) 168 | _logger.debug('Deleted temporary file at %s.', path) 169 | else: 170 | _logger.debug('No temporary file or directory to delete at %s.', path) 171 | -------------------------------------------------------------------------------- /scripts/hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Hadoop utilities to setup a standalone HDFS cluster for integration tests. 4 | # 5 | # The following commands will download Hadoop locally and start a single node 6 | # HDFS cluster: 7 | # 8 | # ```bash 9 | # $ export HADOOP_HOME="$(./scripts/hadoop.sh download)" 10 | # $ export HADOOP_CONF_DIR="$(./scripts/hadoop.sh config)" 11 | # $ ./scripts/hadoop.sh start 12 | # ``` 13 | # 14 | # Later, to stop it: 15 | # 16 | # ```bash 17 | # $ ./scripts/hadoop.sh stop 18 | # ``` 19 | # 20 | 21 | set -o nounset 22 | set -o errexit 23 | 24 | # Print usage and exit. 25 | # 26 | # Refer to individual functions below for more information. 27 | # 28 | usage() { 29 | echo "usage: $0 (config|download|start|stop)" >&2 30 | exit 1 31 | } 32 | 33 | # Download Hadoop binary. 34 | # 35 | # TODO: Test against several versions? (But they are very big...) 36 | # 37 | hadoop-download() { 38 | # Verification as per https://web.archive.org/web/20211018165755/https://hadoop.apache.org/releases.html#to-verify-hadoop-releases-using-gpg 39 | local hadoop=hadoop-2.9.2 40 | cd "$(mktemp -d 2>/dev/null || mktemp -d -t hadoop)" 41 | curl -O "https://archive.apache.org/dist/hadoop/common/$hadoop/$hadoop.tar.gz" 42 | curl -O "https://archive.apache.org/dist/hadoop/common/$hadoop/$hadoop.tar.gz.asc" 43 | curl -O https://downloads.apache.org/hadoop/common/KEYS 44 | gpg -q --import KEYS 45 | gpg --verify "$hadoop.tar.gz.asc" "$hadoop.tar.gz" 46 | tar -xzf "$hadoop.tar.gz" 47 | echo "$(pwd)/$hadoop" 48 | } 49 | 50 | # Generate configuration and print corresponding path. 51 | # 52 | # The returned path is suitable to be used as environment variable 53 | # `$HADOOP_CONF_DIR`. Note that this is necessary because proxy users are 54 | # defined as property keys, so it's not possible to allow the current user 55 | # otherwise. 56 | # 57 | hadoop-config() { 58 | local tpl_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../etc/hadoop" 59 | local conf_dir="$(mktemp -d 2>/dev/null || mktemp -d -t 'hadoop-conf')" 60 | for i in "$tpl_dir"/*; do 61 | sed -e "s/#USER#/$(whoami)/" "$i" >"${conf_dir}/$(basename "$i")" 62 | done 63 | echo "$conf_dir" 64 | } 65 | 66 | # Start HDFS cluster (single namenode and datanode) and HttpFS server. 67 | # 68 | # This requires `$HADOOP_HOME` and `$HADOOP_CONF_DIR` to be set. 69 | # 70 | hadoop-start() { 71 | "${HADOOP_HOME}/bin/hdfs" namenode -format -nonInteractive || : 72 | "${HADOOP_HOME}/sbin/hadoop-daemon.sh" --config "$HADOOP_CONF_DIR" --script hdfs start namenode 73 | "${HADOOP_HOME}/sbin/hadoop-daemon.sh" --config "$HADOOP_CONF_DIR" --script hdfs start datanode 74 | HTTPFS_CONFIG="$HADOOP_CONF_DIR" "${HADOOP_HOME}/sbin/httpfs.sh" start 75 | } 76 | 77 | # Stop HDFS cluster and HttpFS server. 78 | # 79 | # This requires `$HADOOP_HOME` to be set. 
80 | # 81 | hadoop-stop() { 82 | "${HADOOP_HOME}/sbin/httpfs.sh" stop 83 | "${HADOOP_HOME}/sbin/hadoop-daemon.sh" --script hdfs stop datanode 84 | "${HADOOP_HOME}/sbin/hadoop-daemon.sh" --script hdfs stop namenode 85 | } 86 | 87 | if [[ $# -ne 1 ]]; then 88 | usage 89 | fi 90 | 91 | case "$1" in 92 | download) hadoop-download ;; 93 | config) hadoop-config ;; 94 | start) hadoop-start ;; 95 | stop) hadoop-stop ;; 96 | *) usage ;; 97 | esac 98 | -------------------------------------------------------------------------------- /scripts/version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o nounset 4 | set -o errexit 5 | set -o pipefail 6 | shopt -s nullglob 7 | 8 | __dirname="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 9 | 10 | fail() { # MSG 11 | echo "$1" >&2 && exit 1 12 | } 13 | 14 | version_pattern="__version__ = '([^']+)'" 15 | 16 | main() { 17 | cd "$__dirname/.." 18 | local line="$(grep __version__ hdfs/__init__.py)" 19 | if ! [[ $line =~ $version_pattern ]]; then 20 | fail 'missing version' 21 | fi 22 | echo "${BASH_REMATCH[1]}" 23 | } 24 | 25 | main "$@" 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """HdfsCLI: API and command line interface for HDFS.""" 4 | 5 | from os import environ 6 | from setuptools import find_packages, setup 7 | import re 8 | 9 | 10 | def _get_version(): 11 | """Extract version from package.""" 12 | with open('hdfs/__init__.py') as reader: 13 | match = re.search( 14 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 15 | reader.read(), 16 | re.MULTILINE 17 | ) 18 | if match: 19 | return match.group(1) 20 | else: 21 | raise RuntimeError('Unable to extract version.') 22 | 23 | def _get_long_description(): 24 | """Get README contents.""" 25 | with open('README.md') as reader: 26 | return reader.read() 27 | 28 | # Allow configuration of the CLI alias. 
29 | ENTRY_POINT = environ.get('HDFSCLI_ENTRY_POINT', 'hdfscli') 30 | 31 | setup( 32 | name='hdfs', 33 | version=_get_version(), 34 | description=__doc__, 35 | long_description=_get_long_description(), 36 | long_description_content_type='text/markdown', 37 | author='Matthieu Monsch', 38 | author_email='mtth@apache.org', 39 | url='https://hdfscli.readthedocs.io', 40 | license='MIT', 41 | packages=find_packages(exclude=['test*']), 42 | classifiers=[ 43 | 'Development Status :: 5 - Production/Stable', 44 | 'Intended Audience :: Developers', 45 | 'License :: OSI Approved :: MIT License', 46 | 'Programming Language :: Python', 47 | 'Programming Language :: Python :: 3.7', 48 | 'Programming Language :: Python :: 3.8', 49 | 'Programming Language :: Python :: 3.9', 50 | 'Programming Language :: Python :: 3.10', 51 | 'Programming Language :: Python :: 3.11', 52 | 'Programming Language :: Python :: 3.12', 53 | ], 54 | install_requires=[ 55 | 'docopt', 56 | 'requests>=2.7.0', 57 | 'six>=1.9.0', 58 | ], 59 | extras_require={ 60 | 'avro': ['fastavro>=0.21.19'], 61 | 'kerberos': ['requests-kerberos>=0.7.0'], 62 | 'dataframe': ['fastavro>=0.21.19', 'pandas>=0.14.1'], 63 | }, 64 | entry_points={'console_scripts': [ 65 | '{} = hdfs.__main__:main'.format(ENTRY_POINT), 66 | '{}-avro = hdfs.ext.avro.__main__:main'.format(ENTRY_POINT), 67 | ]}, 68 | ) 69 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtth/hdfs/039a7f4730653a8264c092845b5602ccb692a7ef/test/__init__.py -------------------------------------------------------------------------------- /test/dat/client_template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """A template for generating new clients. 5 | 6 | This is used to test autoloading from `CliConfig` (see `test/test_main.py`). 
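The `$class_name` placeholder is substituted with a concrete class name before
this module is loaded. A rough sketch of the idea (the use of `string.Template`
and the `FooClient` name are assumptions for illustration; the actual mechanism
lives in `test/test_main.py`)::

  from string import Template

  with open('test/dat/client_template.py') as reader:
    source = Template(reader.read()).substitute(class_name='FooClient')
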
7 | 8 | """ 9 | 10 | from hdfs import Client 11 | 12 | 13 | class $class_name(Client): 14 | 15 | one = 1 16 | 17 | def __init__(self, url): 18 | super($class_name, self).__init__(url) 19 | -------------------------------------------------------------------------------- /test/dat/weather.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtth/hdfs/039a7f4730653a8264c092845b5602ccb692a7ef/test/dat/weather.avro -------------------------------------------------------------------------------- /test/dat/weather.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test.Weather", 3 | "type": "record", 4 | "fields": [ 5 | {"name": "station", "type": "string"}, 6 | {"name": "time", "type": "long"}, 7 | {"name": "temp", "type": "int"} 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /test/dat/weather.jsonl: -------------------------------------------------------------------------------- 1 | {"station":"gqxurbcrru","time":-3367677834113346249,"temp":209887781} 2 | {"station":"fdvvmtpedxsifd","time":6645465191399988678,"temp":-2056157190} 3 | {"station":"ci","time":6543782083632958711,"temp":-565739712} 4 | {"station":"xadxqapgjnk","time":-4449449961661895660,"temp":2065274889} 5 | {"station":"usafhhcjcfw","time":-6395806787784552082,"temp":254868980} 6 | -------------------------------------------------------------------------------- /test/test_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test Hdfs client interactions with HDFS.""" 5 | 6 | from collections import defaultdict 7 | from hdfs.client import * 8 | from hdfs.util import HdfsError, temppath 9 | from test.util import _IntegrationTest 10 | from requests.exceptions import ConnectTimeout, ReadTimeout 11 | from shutil import rmtree 12 | from six import b 13 | from tempfile import mkdtemp 14 | import os 15 | import os.path as osp 16 | import posixpath as psp 17 | import pytest 18 | 19 | 20 | class TestLoad(object): 21 | 22 | """Test client loader.""" 23 | 24 | def test_bare(self): 25 | client = Client.from_options({'url': 'foo'}) 26 | assert isinstance(client, Client) 27 | 28 | def test_new_type(self): 29 | class NewClient(Client): 30 | def __init__(self, url, bar): 31 | super(NewClient, self).__init__(url) 32 | self.bar = bar 33 | client = Client.from_options({'url': 'bar', 'bar': 2}, 'NewClient') 34 | assert client.bar == 2 35 | 36 | def test_missing_options(self): 37 | with pytest.raises(HdfsError): 38 | Client.from_options({}, 'KerberosClient') 39 | 40 | def test_invalid_options(self): 41 | with pytest.raises(HdfsError): 42 | Client.from_options({'foo': 123}) 43 | 44 | def test_missing_type(self): 45 | with pytest.raises(HdfsError): 46 | Client.from_options({}, 'MissingClient') 47 | 48 | def test_timeout(self): 49 | assert Client('')._timeout == None 50 | assert Client('', timeout=1)._timeout == 1 51 | assert Client('', timeout=(1,2))._timeout == (1,2) 52 | assert Client.from_options({'url': ''})._timeout == None 53 | 54 | 55 | class TestOptions(_IntegrationTest): 56 | 57 | """Test client options.""" 58 | 59 | @pytest.mark.skip(reason="TODO: Investigate why this fails in Python 3.7 and 3.9") 60 | def test_timeout(self): 61 | with pytest.raises(ConnectTimeout, ReadTimeout): 62 | self.client._timeout = 1e-6 # Small enough for it to always timeout. 
63 | try: 64 | self.client.status('.') 65 | finally: 66 | self.client._timeout = None 67 | 68 | 69 | class TestApi(_IntegrationTest): 70 | 71 | """Test client raw API interactions.""" 72 | 73 | def test_list_status_absolute_root(self): 74 | assert self.client._list_status('/') 75 | 76 | def test_get_folder_status(self): 77 | self.client._mkdirs('foo') 78 | status = self.client._get_file_status('foo').json()['FileStatus'] 79 | assert status['type'] == 'DIRECTORY' 80 | 81 | def test_get_home_directory(self): 82 | path = self.client._get_home_directory('/').json()['Path'] 83 | assert '/user/' in path 84 | 85 | def test_delete_file(self): 86 | path = 'bar' 87 | self._write(path, b'hello') 88 | assert self.client._delete(path).json()['boolean'] 89 | assert not self._exists(path) 90 | 91 | def test_delete_missing_file(self): 92 | path = 'bar2' 93 | assert not self.client._delete(path).json()['boolean'] 94 | 95 | def test_rename_file(self): 96 | paths = ['foo', '{}/bar'.format(self.client.root.rstrip('/'))] 97 | self._write(paths[0], b'hello') 98 | assert self.client._rename(paths[0], destination=paths[1]).json()['boolean'] 99 | assert not self._exists(paths[0]) 100 | assert self.client._open(paths[1].rsplit('/', 1)[1]).content == b'hello' 101 | self.client._delete(paths[1]) 102 | 103 | def test_rename_file_to_existing(self): 104 | p = ['foo', '{}/bar'.format(self.client.root.rstrip('/'))] 105 | self._write(p[0], b'hello') 106 | self._write(p[1], b'hi') 107 | try: 108 | assert not self.client._rename(p[0], destination=p[1]).json()['boolean'] 109 | finally: 110 | self.client._delete(p[0]) 111 | self.client._delete(p[1]) 112 | 113 | def test_open_file(self): 114 | self._write('foo', b'hello') 115 | assert self.client._open('foo').content == b'hello' 116 | 117 | def test_get_file_checksum(self): 118 | self._write('foo', b'hello') 119 | data = self.client._get_file_checksum('foo').json()['FileChecksum'] 120 | assert sorted(data) == ['algorithm', 'bytes', 'length'] 121 | assert int(data['length']) 122 | 123 | def test_get_file_checksum_on_folder(self): 124 | with pytest.raises(HdfsError): 125 | self.client._get_file_checksum('') 126 | 127 | 128 | class TestResolve(_IntegrationTest): 129 | 130 | def test_resolve_relative(self): 131 | assert Client('url', root='/').resolve('bar') == '/bar' 132 | assert Client('url', root='/foo').resolve('bar') == '/foo/bar' 133 | assert Client('url', root='/foo/').resolve('bar') == '/foo/bar' 134 | assert Client('url', root='/foo/').resolve('bar/') == '/foo/bar' 135 | assert Client('url', root='/foo/').resolve('/bar/') == '/bar' 136 | 137 | def test_resolve_relative_no_root(self): 138 | root = self.client.root 139 | try: 140 | self.client.root = None 141 | home = self.client._get_home_directory('/').json()['Path'] 142 | assert self.client.resolve('bar') == psp.join(home, 'bar') 143 | assert self.client.root == home 144 | finally: 145 | self.client.root = root 146 | 147 | def test_resolve_relative_root(self): 148 | root = self.client.root 149 | try: 150 | self.client.root = 'bar' 151 | home = self.client._get_home_directory('/').json()['Path'] 152 | assert self.client.resolve('foo') == psp.join(home, 'bar', 'foo') 153 | assert self.client.root == psp.join(home, 'bar') 154 | finally: 155 | self.client.root = root 156 | 157 | def test_resolve_absolute(self): 158 | assert Client('url').resolve('/bar') == '/bar' 159 | assert Client('url').resolve('/bar/foo/') == '/bar/foo' 160 | 161 | def test_create_file_with_percent(self): 162 | # `%` (`0x25`) is a special case because it 
seems to cause errors (even 163 | # though the action still goes through). Typical error message will be 164 | # `"Unknown exception in doAs"`. 165 | path = 'fo&o/a%a' 166 | try: 167 | self._write(path, b'hello') 168 | except HdfsError: 169 | pass 170 | assert self._read(path) == b'hello' 171 | 172 | 173 | class TestWrite(_IntegrationTest): 174 | 175 | def test_create_from_string(self): 176 | self.client.write('up', b'hello, world!') 177 | assert self._read('up') == b'hello, world!' 178 | 179 | def test_create_from_string_with_encoding(self): 180 | self.client.write('up', u'hello, world!', encoding='utf-8') 181 | assert self._read('up') == b'hello, world!' 182 | 183 | def test_create_from_generator(self): 184 | data = (e for e in [b'hello, ', b'world!']) 185 | self.client.write('up', data) 186 | assert self._read('up') == b'hello, world!' 187 | 188 | def test_create_from_generator_with_encoding(self): 189 | data = (e for e in [u'hello, ', u'world!']) 190 | self.client.write('up', data, encoding='utf-8') 191 | assert self._read('up') == b'hello, world!' 192 | 193 | def test_create_from_file_object(self): 194 | with temppath() as tpath: 195 | with open(tpath, 'w') as writer: 196 | writer.write('hello, world!') 197 | with open(tpath) as reader: 198 | self.client.write('up', reader) 199 | assert self._read('up') == b'hello, world!' 200 | 201 | def test_create_set_permission(self): 202 | self.client.write('up', b'hello, world!', permission='722') 203 | assert self._read('up') == b'hello, world!' 204 | assert self.client.status('up')['permission'] == '722' 205 | 206 | def test_create_to_existing_file_without_overwrite(self): 207 | with pytest.raises(HdfsError): 208 | self.client.write('up', b'hello, world!') 209 | self.client.write('up', b'hello again, world!') 210 | 211 | def test_create_and_overwrite_file(self): 212 | self.client.write('up', b'hello, world!') 213 | self.client.write('up', b'hello again, world!', overwrite=True) 214 | assert self._read('up') == b'hello again, world!' 215 | 216 | def test_as_context_manager(self): 217 | with self.client.write('up') as writer: 218 | writer.write(b'hello, ') 219 | writer.write(b'world!') 220 | assert self._read('up') == b'hello, world!' 221 | 222 | def test_as_context_manager_with_encoding(self): 223 | with self.client.write('up', encoding='utf-8') as writer: 224 | writer.write(u'hello, ') 225 | writer.write(u'world!') 226 | assert self._read('up') == b'hello, world!' 227 | 228 | def test_dump_json(self): 229 | from json import dump, loads 230 | data = {'one': 1, 'two': 2} 231 | with self.client.write('up', encoding='utf-8') as writer: 232 | dump(data, writer) 233 | assert loads(self._read('up', encoding='utf-8')) == data 234 | 235 | def test_create_and_overwrite_directory(self): 236 | with pytest.raises(HdfsError): 237 | # can't overwrite a directory with a file 238 | self.client._mkdirs('up') 239 | self.client.write('up', b'hello, world!') 240 | 241 | def test_create_invalid_path(self): 242 | with pytest.raises(HdfsError): 243 | # conversely, can't overwrite a file with a directory 244 | self.client.write('up', b'hello, world!') 245 | self.client.write('up/up', b'hello again, world!') 246 | 247 | 248 | class TestAppend(_IntegrationTest): 249 | 250 | @classmethod 251 | def setup_class(cls): 252 | super(TestAppend, cls).setup_class() 253 | if cls.client: 254 | try: 255 | cls.client.write('ap', b'') # We can't append to an empty file. 256 | cls.client.write('ap', b'', append=True) # Try a simple append. 
257 | except HdfsError as err: 258 | if 'Append is not supported' in str(err): 259 | cls.client = None 260 | # Skip these tests if HDFS isn't configured to support appends. 261 | else: 262 | raise err 263 | 264 | def test_simple(self): 265 | self.client.write('ap', b'hello,') 266 | self.client.write('ap', b' world!', append=True) 267 | assert self._read('ap') == b'hello, world!' 268 | 269 | def test_missing_file(self): 270 | with pytest.raises(HdfsError): 271 | self.client.write('ap', b'hello!', append=True) 272 | 273 | def test_overwrite_and_append(self): 274 | with pytest.raises(ValueError): 275 | self.client.write('ap', b'hello!', overwrite=True, append=True) 276 | 277 | def test_set_permission_and_append(self): 278 | with pytest.raises(ValueError): 279 | self.client.write('ap', b'hello!', permission='777', append=True) 280 | 281 | 282 | class TestUpload(_IntegrationTest): 283 | 284 | def test_upload_file(self): 285 | with temppath() as tpath: 286 | with open(tpath, 'w') as writer: 287 | writer.write('hello, world!') 288 | self.client.upload('up', tpath) 289 | assert self._read('up') == b'hello, world!' 290 | 291 | def test_upload_missing(self): 292 | with pytest.raises(HdfsError): 293 | with temppath() as tpath: 294 | self.client.upload('up', tpath) 295 | 296 | def test_upload_empty_directory(self): 297 | with pytest.raises(HdfsError): 298 | dpath = mkdtemp() 299 | try: 300 | self.client.upload('up', dpath) 301 | finally: 302 | os.rmdir(dpath) 303 | 304 | def test_upload_directory_to_existing_directory(self): 305 | dpath = mkdtemp() 306 | try: 307 | npath = osp.join(dpath, 'hi') 308 | os.mkdir(npath) 309 | with open(osp.join(npath, 'foo'), 'w') as writer: 310 | writer.write('hello!') 311 | os.mkdir(osp.join(npath, 'bar')) 312 | with open(osp.join(npath, 'bar', 'baz'), 'w') as writer: 313 | writer.write('world!') 314 | self.client._mkdirs('up') 315 | self.client.upload('up', npath) 316 | assert self._read('up/hi/foo') == b'hello!' 317 | assert self._read('up/hi/bar/baz') == b'world!' 318 | finally: 319 | rmtree(dpath) 320 | 321 | def test_upload_directory_to_missing(self): 322 | dpath = mkdtemp() 323 | try: 324 | with open(osp.join(dpath, 'foo'), 'w') as writer: 325 | writer.write('hello!') 326 | os.mkdir(osp.join(dpath, 'bar')) 327 | with open(osp.join(dpath, 'bar', 'baz'), 'w') as writer: 328 | writer.write('world!') 329 | self.client.upload('up', dpath) 330 | assert self._read('up/foo') == b'hello!' 331 | assert self._read('up/bar/baz') == b'world!' 332 | finally: 333 | rmtree(dpath) 334 | 335 | def test_upload_directory_overwrite_existing_file(self): 336 | dpath = mkdtemp() 337 | try: 338 | with open(osp.join(dpath, 'foo'), 'w') as writer: 339 | writer.write('hello!') 340 | os.mkdir(osp.join(dpath, 'bar')) 341 | with open(osp.join(dpath, 'bar', 'baz'), 'w') as writer: 342 | writer.write('world!') 343 | self._write('up', b'hi') 344 | self.client.upload('up', dpath, overwrite=True) 345 | assert self._read('up/foo') == b'hello!' 346 | assert self._read('up/bar/baz') == b'world!' 
347 | finally: 348 | rmtree(dpath) 349 | 350 | def test_upload_overwrite(self): 351 | with temppath() as tpath: 352 | with open(tpath, 'w') as writer: 353 | writer.write('hello') 354 | self.client.upload('up', tpath) 355 | with temppath() as tpath: 356 | with open(tpath, 'w') as writer: 357 | writer.write('there') 358 | self.client.upload('up', tpath, overwrite=True) 359 | assert self._read('up') == b'there' 360 | 361 | def test_upload_overwrite_error(self): 362 | with pytest.raises(HdfsError): 363 | with temppath() as tpath: 364 | with open(tpath, 'w') as writer: 365 | writer.write('here') 366 | self.client.upload('up', tpath) 367 | self.client.upload('up', tpath) 368 | 369 | def test_upload_cleanup(self): 370 | dpath = mkdtemp() 371 | _write = self.client.write 372 | 373 | def write(hdfs_path, *args, **kwargs): 374 | if 'bar' in hdfs_path: 375 | raise RuntimeError() 376 | return _write(hdfs_path, *args, **kwargs) 377 | 378 | try: 379 | self.client.write = write 380 | npath = osp.join(dpath, 'hi') 381 | os.mkdir(npath) 382 | with open(osp.join(npath, 'foo'), 'w') as writer: 383 | writer.write('hello!') 384 | os.mkdir(osp.join(npath, 'bar')) 385 | with open(osp.join(npath, 'bar', 'baz'), 'w') as writer: 386 | writer.write('world!') 387 | try: 388 | self.client.upload('foo', dpath) 389 | except RuntimeError: 390 | assert not self._exists('foo') 391 | else: 392 | assert False # This shouldn't happen. 393 | finally: 394 | rmtree(dpath) 395 | self.client.write = _write 396 | 397 | def test_upload_no_cleanup(self): 398 | dpath = mkdtemp() 399 | _write = self.client.write 400 | 401 | def write(hdfs_path, *args, **kwargs): 402 | if 'bar' in hdfs_path: 403 | raise RuntimeError() 404 | return _write(hdfs_path, *args, **kwargs) 405 | 406 | try: 407 | self.client.write = write 408 | npath = osp.join(dpath, 'hi') 409 | os.mkdir(npath) 410 | with open(osp.join(npath, 'foo'), 'w') as writer: 411 | writer.write('hello!') 412 | os.mkdir(osp.join(npath, 'bar')) 413 | with open(osp.join(npath, 'bar', 'baz'), 'w') as writer: 414 | writer.write('world!') 415 | try: 416 | self.client.upload('foo', dpath, cleanup=False) 417 | except RuntimeError: 418 | # The outer folder still exists. 419 | assert self._exists('foo') 420 | else: 421 | assert False # This shouldn't happen. 422 | finally: 423 | rmtree(dpath) 424 | self.client.write = _write 425 | 426 | def test_upload_with_progress(self): 427 | 428 | def callback(path, nbytes, history=defaultdict(list)): 429 | history[path].append(nbytes) 430 | return history 431 | 432 | dpath = mkdtemp() 433 | try: 434 | path1 = osp.join(dpath, 'foo') 435 | with open(path1, 'w') as writer: 436 | writer.write('hello!') 437 | os.mkdir(osp.join(dpath, 'bar')) 438 | path2 = osp.join(dpath, 'bar', 'baz') 439 | with open(path2, 'w') as writer: 440 | writer.write('the world!') 441 | self.client.upload( 442 | 'up', 443 | dpath, 444 | chunk_size=4, 445 | n_threads=1, # Callback isn't thread-safe. 446 | progress=callback 447 | ) 448 | assert self._read('up/foo') == b'hello!' 449 | assert self._read('up/bar/baz') == b'the world!' 
450 | assert ( 451 | callback('', 0) == 452 | {path1: [4, 6, -1], path2: [4, 8, 10, -1], '': [0]}) 453 | finally: 454 | rmtree(dpath) 455 | 456 | 457 | class TestDelete(_IntegrationTest): 458 | 459 | def test_delete_file(self): 460 | self._write('foo', b'hello, world!') 461 | assert self.client.delete('foo') 462 | assert not self._exists('foo') 463 | 464 | def test_delete_empty_directory(self): 465 | self.client._mkdirs('foo') 466 | assert self.client.delete('foo') 467 | assert not self._exists('foo') 468 | 469 | def test_delete_missing_file(self): 470 | assert not self.client.delete('foo') 471 | 472 | def test_delete_non_empty_directory(self): 473 | self._write('de/foo', b'hello, world!') 474 | assert self.client.delete('de', recursive=True) 475 | assert not self._exists('de') 476 | 477 | def test_delete_non_empty_directory_without_recursive(self): 478 | with pytest.raises(HdfsError): 479 | self._write('de/foo', b'hello, world!') 480 | self.client.delete('de') 481 | 482 | def test_trash_file(self): 483 | self._write('foo', b'hello, world!') 484 | assert self.client.delete('foo', skip_trash=False) 485 | assert self.client.status('foo', strict=False) == None 486 | 487 | def test_trash_missing_file(self): 488 | assert not self.client.delete('foo', skip_trash=False) 489 | 490 | def test_trash_directory_non_recursive(self): 491 | with pytest.raises(HdfsError): 492 | self._write('bar/foo', b'hello, world!') 493 | self.client.delete('bar', skip_trash=False) 494 | 495 | def test_trash_directory(self): 496 | self._write('bar/foo', b'hello, world!') 497 | assert self.client.delete('bar', recursive=True, skip_trash=False) 498 | assert self.client.status('bar', strict=False) == None 499 | 500 | 501 | class TestRead(_IntegrationTest): 502 | 503 | def test_progress_without_chunk_size(self): 504 | with pytest.raises(ValueError): 505 | self._write('foo', b'hello, world!') 506 | with self.client.read('foo', progress=lambda path, nbytes: None) as reader: 507 | pass 508 | 509 | def test_delimiter_without_encoding(self): 510 | with pytest.raises(ValueError): 511 | self._write('foo', b'hello, world!') 512 | with self.client.read('foo', delimiter=',') as reader: 513 | pass 514 | 515 | def test_delimiter_with_chunk_size(self): 516 | with pytest.raises(ValueError): 517 | self._write('foo', b'hello, world!') 518 | with self.client.read('foo', delimiter=',', chunk_size=1) as reader: 519 | pass 520 | 521 | def test_read_file(self): 522 | self._write('foo', b'hello, world!') 523 | with self.client.read('foo') as reader: 524 | assert reader.read() == b'hello, world!' 525 | 526 | def test_read_directory(self): 527 | with pytest.raises(HdfsError): 528 | self.client._mkdirs('foo') 529 | with self.client.read('foo') as reader: 530 | pass 531 | 532 | def test_read_missing_file(self): 533 | with pytest.raises(HdfsError): 534 | with self.client.read('foo') as reader: 535 | pass 536 | 537 | def test_read_file_from_offset(self): 538 | self._write('foo', b'hello, world!') 539 | with self.client.read('foo', offset=7) as reader: 540 | assert reader.read() == b'world!' 
541 | 542 | def test_read_file_from_offset_with_limit(self): 543 | self._write('foo', b'hello, world!') 544 | with self.client.read('foo', offset=7, length=5) as reader: 545 | assert reader.read() == b'world' 546 | 547 | def test_read_file_with_chunk_size(self): 548 | self._write('foo', b'hello, world!') 549 | with self.client.read('foo', chunk_size=5) as reader: 550 | assert list(reader) == [b'hello', b', wor', b'ld!'] 551 | 552 | def test_with_progress(self): 553 | def cb(path, nbytes, chunk_lengths=[]): 554 | chunk_lengths.append(nbytes) 555 | return chunk_lengths 556 | self._write('foo', b'hello, world!') 557 | with temppath() as tpath: 558 | with open(tpath, 'wb') as writer: 559 | with self.client.read('foo', chunk_size=5, progress=cb) as reader: 560 | for chunk in reader: 561 | writer.write(chunk) 562 | with open(tpath, 'rb') as reader: 563 | assert reader.read() == b'hello, world!' 564 | assert cb('', 0) == [5, 10, 13, -1, 0] 565 | 566 | def test_read_with_encoding(self): 567 | s = u'hello, world!' 568 | self._write('foo', s, encoding='utf-8') 569 | with self.client.read('foo', encoding='utf-8') as reader: 570 | assert reader.read() == s 571 | 572 | def test_read_with_chunk_size_and_encoding(self): 573 | s = u'hello, world!' 574 | self._write('foo', s, encoding='utf-8') 575 | with self.client.read('foo', chunk_size=5, encoding='utf-8') as reader: 576 | assert list(reader) == [u'hello', u', wor', u'ld!'] 577 | 578 | def test_read_json(self): 579 | from json import dumps, load 580 | data = {'one': 1, 'two': 2} 581 | self._write('foo', data=dumps(data), encoding='utf-8') 582 | with self.client.read('foo', encoding='utf-8') as reader: 583 | assert load(reader) == data 584 | 585 | def test_read_with_delimiter(self): 586 | self._write('foo', u'hi\nworld!\n', encoding='utf-8') 587 | with self.client.read('foo', delimiter='\n', encoding='utf-8') as reader: 588 | assert list(reader) == [u'hi', u'world!', u''] 589 | 590 | 591 | class TestRename(_IntegrationTest): 592 | 593 | def test_rename_file(self): 594 | self._write('foo', b'hello, world!') 595 | self.client.rename('foo', 'bar') 596 | assert self._read('bar') == b'hello, world!' 597 | 598 | def test_rename_missing_file(self): 599 | with pytest.raises(HdfsError): 600 | self.client.rename('foo', 'bar') 601 | 602 | def test_rename_file_to_existing_file(self): 603 | with pytest.raises(HdfsError): 604 | self._write('foo', b'hello, world!') 605 | self._write('bar', b'hello again, world!') 606 | self.client.rename('foo', 'bar') 607 | 608 | def test_move_file_into_existing_directory(self): 609 | self._write('foo', b'hello, world!') 610 | self.client._mkdirs('bar') 611 | self.client.rename('foo', 'bar') 612 | assert self._read('bar/foo') == b'hello, world!' 613 | 614 | def test_rename_file_into_existing_directory(self): 615 | self._write('foo', b'hello, world!') 616 | self.client._mkdirs('bar') 617 | self.client.rename('foo', 'bar/baz') 618 | assert self._read('bar/baz') == b'hello, world!' 619 | 620 | def test_rename_file_with_special_characters(self): 621 | path = 'fo&oa ?a=1' 622 | self._write('foo', b'hello, world!') 623 | self.client.rename('foo', path) 624 | assert self._read(path) == b'hello, world!' 
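# Taken together, the TestRename cases above document the rename() semantics
# relied on elsewhere: renaming onto an existing file raises HdfsError, while
# renaming onto an existing directory moves the source inside it (hence the
# 'bar/foo' read-back in test_move_file_into_existing_directory). A sketch of
# that second behaviour (client and paths are illustrative):
#
#   client.write('logs/2024.txt', b'...')
#   client.makedirs('archive')
#   client.rename('logs/2024.txt', 'archive')  # Lands at 'archive/2024.txt'.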
625 | 626 | 627 | class TestDownload(_IntegrationTest): 628 | 629 | def test_missing_dir(self): 630 | with pytest.raises(HdfsError): 631 | self._write('dl', b'hello') 632 | with temppath() as tpath: 633 | self.client.download('dl', osp.join(tpath, 'foo')) 634 | 635 | def test_normal_file(self): 636 | self._write('dl', b'hello') 637 | with temppath() as tpath: 638 | fpath = self.client.download('dl', tpath) 639 | with open(fpath) as reader: 640 | assert reader.read() == 'hello' 641 | 642 | def test_nonpartitioned_file(self): 643 | partname = 'part-r-00000' 644 | self._write('dl/' + partname, b'world') 645 | with temppath() as tpath: 646 | fname = self.client.download('dl/' + partname, tpath) 647 | with open(fname) as reader: 648 | assert reader.read() == 'world' 649 | 650 | def test_singly_partitioned_file(self): 651 | partname = 'part-r-00000' 652 | self._write('dl/' + partname, b'world') 653 | with temppath() as tpath: 654 | os.mkdir(tpath) 655 | fname = self.client.download('dl', tpath) 656 | with open(osp.join(fname, partname)) as reader: 657 | assert reader.read() == 'world' 658 | 659 | def _download_partitioned_file(self, n_threads): 660 | parts = { 661 | 'part-r-00000': b'fee', 662 | 'part-r-00001': b'faa', 663 | 'part-r-00002': b'foo', 664 | } 665 | for name, content in parts.items(): 666 | self._write('dl/{}'.format(name), content) 667 | with temppath() as tpath: 668 | self.client.download('dl', tpath, n_threads=-1) 669 | local_parts = os.listdir(tpath) 670 | assert set(local_parts) == set(parts) # We have all the parts. 671 | for part in local_parts: 672 | with open(osp.join(tpath, part), mode='rb') as reader: 673 | assert reader.read() == parts[part] # Their content is correct. 674 | 675 | def test_partitioned_file_max_threads(self): 676 | self._download_partitioned_file(0) 677 | 678 | def test_partitioned_file_sync(self): 679 | self._download_partitioned_file(1) 680 | 681 | def test_partitioned_file_setting_n_threads(self): 682 | self._download_partitioned_file(2) 683 | 684 | def test_overwrite_file(self): 685 | with temppath() as tpath: 686 | self._write('dl', b'hello') 687 | self.client.download('dl', tpath) 688 | self.client.write('dl', b'there', overwrite=True) 689 | fname = self.client.download('dl', tpath, overwrite=True) 690 | with open(fname) as reader: 691 | assert reader.read() == 'there' 692 | 693 | def test_download_file_to_existing_file(self): 694 | with pytest.raises(HdfsError): 695 | self._write('dl', b'hello') 696 | with temppath() as tpath: 697 | with open(tpath, 'w') as writer: 698 | writer.write('hi') 699 | self.client.download('dl', tpath) 700 | 701 | def test_download_file_to_existing_file_with_overwrite(self): 702 | self._write('dl', b'hello') 703 | with temppath() as tpath: 704 | with open(tpath, 'w') as writer: 705 | writer.write('hi') 706 | self.client.download('dl', tpath, overwrite=True) 707 | with open(tpath) as reader: 708 | assert reader.read() == 'hello' 709 | 710 | def test_download_file_to_existing_folder(self): 711 | self._write('dl', b'hello') 712 | with temppath() as tpath: 713 | os.mkdir(tpath) 714 | self.client.download('dl', tpath) 715 | with open(osp.join(tpath, 'dl')) as reader: 716 | assert reader.read() == 'hello' 717 | 718 | def test_download_file_to_existing_folder_with_matching_file(self): 719 | with pytest.raises(HdfsError): 720 | self._write('dl', b'hello') 721 | with temppath() as tpath: 722 | os.mkdir(tpath) 723 | with open(osp.join(tpath, 'dl'), 'w') as writer: 724 | writer.write('hey') 725 | self.client.download('dl', tpath) 
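# As with uploads, download() refuses to clobber local data: both the
# existing-file case and the matching-file-in-folder case above expect
# HdfsError, and the overwrite=True counterparts (one above, one immediately
# below) show the flag that opts into replacement. Sketch (the local path is
# illustrative):
#
#   self.client.download('dl', '/tmp/dl')                  # Fails if it exists.
#   self.client.download('dl', '/tmp/dl', overwrite=True)  # Replaces it.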
726 | 727 | def test_download_file_to_existing_folder_overwrite_matching_file(self): 728 | self._write('dl', b'hello') 729 | with temppath() as tpath: 730 | os.mkdir(tpath) 731 | with open(osp.join(tpath, 'dl'), 'w') as writer: 732 | writer.write('hey') 733 | self.client.download('dl', tpath, overwrite=True) 734 | with open(osp.join(tpath, 'dl')) as reader: 735 | assert reader.read() == 'hello' 736 | 737 | def test_download_folder_to_existing_folder(self): 738 | self._write('foo/dl', b'hello') 739 | self._write('foo/bar/dl', b'there') 740 | with temppath() as tpath: 741 | os.mkdir(tpath) 742 | self.client.download('foo', tpath) 743 | with open(osp.join(tpath, 'foo', 'dl')) as reader: 744 | assert reader.read() == 'hello' 745 | with open(osp.join(tpath, 'foo', 'bar', 'dl')) as reader: 746 | assert reader.read() == 'there' 747 | 748 | def test_download_folder_to_existing_folder_parallel(self): 749 | self._write('foo/dl', b'hello') 750 | self._write('foo/bar/dl', b'there') 751 | with temppath() as tpath: 752 | os.mkdir(tpath) 753 | self.client.download('foo', tpath, n_threads=0) 754 | with open(osp.join(tpath, 'foo', 'dl')) as reader: 755 | assert reader.read() == 'hello' 756 | with open(osp.join(tpath, 'foo', 'bar', 'dl')) as reader: 757 | assert reader.read() == 'there' 758 | 759 | def test_download_folder_to_missing_folder(self): 760 | self._write('foo/dl', b'hello') 761 | self._write('foo/bar/dl', b'there') 762 | with temppath() as tpath: 763 | self.client.download('foo', tpath) 764 | with open(osp.join(tpath, 'dl')) as reader: 765 | assert reader.read() == 'hello' 766 | with open(osp.join(tpath, 'bar', 'dl')) as reader: 767 | assert reader.read() == 'there' 768 | 769 | def test_download_cleanup(self): 770 | self._write('foo/dl', b'hello') 771 | self._write('foo/bar/dl', b'there') 772 | _read = self.client.read 773 | 774 | def read(hdfs_path, *args, **kwargs): 775 | if 'bar' in hdfs_path: 776 | raise RuntimeError() 777 | return _read(hdfs_path, *args, **kwargs) 778 | 779 | with temppath() as tpath: 780 | try: 781 | self.client.read = read 782 | self.client.download('foo', tpath) 783 | except RuntimeError: 784 | assert not osp.exists(tpath) 785 | else: 786 | assert False # This shouldn't happen. 
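# The except branch above is the interesting part: when a transfer fails
# partway through, download() removes the partially written local directory,
# so the temporary target must no longer exist. This mirrors
# test_upload_cleanup earlier in the file; unlike the upload tests, no
# cleanup=False variant is exercised for downloads here.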
787 | finally: 788 | self.client.read = _read 789 | 790 | def test_download_empty_folder(self): 791 | with pytest.raises(HdfsError): 792 | self.client._mkdirs('foo') 793 | with temppath() as tpath: 794 | self.client.download('foo', tpath) 795 | 796 | def test_download_dir_whitespace(self): 797 | self._write('foo/foo bar.txt', b'hello') 798 | with temppath() as tpath: 799 | self.client.download('foo', tpath) 800 | with open(osp.join(tpath, 'foo bar.txt')) as reader: 801 | assert reader.read() == 'hello' 802 | 803 | def test_download_file_whitespace(self): 804 | self._write('foo/foo bar%.txt', b'hello') 805 | with temppath() as tpath: 806 | self.client.download('foo/foo bar%.txt', tpath) 807 | with open(tpath) as reader: 808 | assert reader.read() == 'hello' 809 | 810 | 811 | class TestStatus(_IntegrationTest): 812 | 813 | def test_directory(self): 814 | self.client._mkdirs('foo') 815 | status = self.client.status('foo') 816 | assert status['type'] == 'DIRECTORY' 817 | assert status['length'] == 0 818 | 819 | def test_file(self): 820 | self._write('foo', b'hello, world!') 821 | status = self.client.status('foo') 822 | assert status['type'] == 'FILE' 823 | assert status['length'] == 13 824 | 825 | def test_missing(self): 826 | with pytest.raises(HdfsError): 827 | self.client.status('foo') 828 | 829 | def test_missing_non_strict(self): 830 | assert self.client.status('foo', strict=False) is None 831 | 832 | 833 | class TestSetOwner(_IntegrationTest): 834 | 835 | @classmethod 836 | def setup_class(cls): 837 | super(TestSetOwner, cls).setup_class() 838 | if cls.client: 839 | try: 840 | cls.client.write('foo', b'') 841 | cls.client.set_owner('foo', 'bar') 842 | except HdfsError as err: 843 | if 'Non-super user cannot change owner' in str(err): 844 | cls.client = None 845 | # Skip these tests if HDFS isn't configured to support them. 
846 | else: 847 | raise err 848 | 849 | def test_directory_owner(self): 850 | new_owner = 'newowner' 851 | self.client._mkdirs('foo') 852 | self.client.set_owner('foo', 'oldowner') 853 | self.client.set_owner('foo', new_owner) 854 | status = self.client.status('foo') 855 | assert status['owner'] == new_owner 856 | 857 | def test_file_owner(self): 858 | new_owner = 'newowner' 859 | self._write('foo', b'hello, world!') 860 | self.client.set_owner('foo', 'oldowner') 861 | self.client.set_owner('foo', new_owner) 862 | status = self.client.status('foo') 863 | assert status['owner'] == new_owner 864 | 865 | def test_directory_for_group(self): 866 | new_group = 'newgroup' 867 | self.client._mkdirs('foo') 868 | self.client.set_owner('foo', group='oldgroup') 869 | self.client.set_owner('foo', group=new_group) 870 | status = self.client.status('foo') 871 | assert status['group'] == new_group 872 | 873 | def test_file_for_group(self): 874 | new_group = 'newgroup' 875 | self._write('foo', b'hello, world!') 876 | self.client.set_owner('foo', group='oldgroup') 877 | self.client.set_owner('foo', group=new_group) 878 | status = self.client.status('foo') 879 | assert status['group'] == new_group 880 | 881 | def test_missing_for_group(self): 882 | with pytest.raises(HdfsError): 883 | self.client.set_owner('foo', group='blah') 884 | 885 | 886 | class TestSetPermission(_IntegrationTest): 887 | 888 | def test_directory(self): 889 | new_permission = '755' 890 | self.client._mkdirs('foo', permission='444') 891 | self.client.set_permission('foo', new_permission) 892 | status = self.client.status('foo') 893 | assert status['permission'] == new_permission 894 | 895 | def test_file(self): 896 | new_permission = '755' 897 | self.client.write('foo', b'hello, world!', permission='444') 898 | self.client.set_permission('foo', new_permission) 899 | status = self.client.status('foo') 900 | assert status['permission'] == new_permission 901 | 902 | def test_missing(self): 903 | with pytest.raises(HdfsError): 904 | self.client.set_permission('foo', '755') 905 | 906 | 907 | class TestContent(_IntegrationTest): 908 | 909 | def test_directory(self): 910 | self._write('foo', b'hello, world!') 911 | content = self.client.content('') 912 | assert content['directoryCount'] == 1 913 | assert content['fileCount'] == 1 914 | assert content['length'] == 13 915 | 916 | def test_file(self): 917 | self._write('foo', b'hello, world!') 918 | content = self.client.content('foo') 919 | assert content['directoryCount'] == 0 920 | assert content['fileCount'] == 1 921 | assert content['length'] == 13 922 | 923 | def test_missing(self): 924 | with pytest.raises(HdfsError): 925 | self.client.content('foo') 926 | 927 | def test_missing_non_strict(self): 928 | assert self.client.content('foo', strict=False) is None 929 | 930 | 931 | class TestAcl(_IntegrationTest): 932 | 933 | def test_directory(self): 934 | self._write('foo', b'hello, world!') 935 | content = self.client.acl_status('') 936 | assert len(content) > 1 937 | assert 'entries' in content 938 | assert 'group' in content 939 | assert 'owner' in content 940 | 941 | def test_set_acl(self): 942 | self.client.write('foo', 'hello, world!') 943 | self.client.set_acl('foo', 'user::rwx,user:foouser:rwx,group::r--,other::---') 944 | content = self.client.acl_status('foo') 945 | assert any('user:foouser:rwx' in s for s in content['entries']) 946 | assert len(content) > 1 947 | assert content['entries'] is not None 948 | 949 | def test_modify_acl(self): 950 | self.client.write('foo', 'hello, world!') 
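# set_acl() takes a comma-separated ACL spec such as
# 'user::rwx,user:name:rwx,group::r--,other::---'. Passing clear=False, as in
# the second call below, merges the given entries into the existing ACL rather
# than replacing it wholesale, which is what lets that call downgrade only the
# foouser entry to rw-.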
951 | self.client.set_acl('foo', 'user::rwx,user:foouser:rwx,group::r--,other::---') 952 | self.client.set_acl('foo', 'user:foouser:rw-', clear=False) 953 | content = self.client.acl_status('foo') 954 | assert any('user:foouser:rw-' in s for s in content['entries']) 955 | 956 | def test_missing(self): 957 | with pytest.raises(HdfsError): 958 | self.client.acl_status('foo') 959 | 960 | def test_missing_non_strict(self): 961 | assert self.client.acl_status('foo', strict=False) is None 962 | 963 | def test_remove_acl_entries(self): 964 | self.client.write('foo', 'hello, world!') 965 | self.client.set_acl('foo', 'user:baruser:rwx,user:foouser:rw-', clear=False) 966 | self.client.remove_acl_entries('foo', 'user:foouser:') 967 | content = self.client.acl_status('foo') 968 | assert not any('user:foouser:rw-' in s for s in content['entries']) 969 | assert any('user:baruser:rwx' in s for s in content['entries']) 970 | 971 | def test_remove_default_acl(self): 972 | self.client.write('foo', 'hello, world!') 973 | self.client.set_acl('foo', 'user:foouser:rwx', clear=False) 974 | self.client.remove_default_acl('foo') 975 | content = self.client.acl_status('foo') 976 | assert not any('user::rwx' in s for s in content['entries']) 977 | 978 | def test_remove_acl(self): 979 | self.client.write('foo', 'hello, world!') 980 | self.client.remove_acl('foo') 981 | content = self.client.acl_status('foo') 982 | assert content.get('entries') == [] 983 | 984 | 985 | class TestList(_IntegrationTest): 986 | 987 | def test_file(self): 988 | with pytest.raises(HdfsError): 989 | self.client.write('foo', 'hello, world!') 990 | self.client.list('foo') 991 | 992 | def test_missing(self): 993 | with pytest.raises(HdfsError): 994 | self.client.list('foo') 995 | 996 | def test_empty_dir(self): 997 | self.client._mkdirs('foo') 998 | assert self.client.list('foo') == [] 999 | 1000 | def test_dir(self): 1001 | self.client.write('foo/bar', 'hello, world!') 1002 | assert self.client.list('foo') == ['bar'] 1003 | 1004 | def test_dir_with_status(self): 1005 | self.client.write('foo/bar', 'hello, world!') 1006 | statuses = self.client.list('foo', status=True) 1007 | assert len(statuses) == 1 1008 | status = self.client.status('foo/bar') 1009 | status['pathSuffix'] = 'bar' 1010 | assert statuses[0] == ('bar', status) 1011 | 1012 | 1013 | class TestWalk(_IntegrationTest): 1014 | 1015 | def test_missing(self): 1016 | with pytest.raises(HdfsError): 1017 | list(self.client.walk('foo')) 1018 | 1019 | def test_file(self): 1020 | self.client.write('foo', 'hello, world!') 1021 | assert not list(self.client.walk('foo')) 1022 | 1023 | def test_folder(self): 1024 | self.client.write('hello', 'hello, world!') 1025 | self.client.write('foo/hey', 'hey, world!') 1026 | infos = list(self.client.walk('')) 1027 | assert len(infos) == 2 1028 | assert infos[0] == (psp.join(self.client.root), ['foo'], ['hello']) 1029 | assert infos[1] == (psp.join(self.client.root, 'foo'), [], ['hey']) 1030 | 1031 | def test_folder_with_depth(self): 1032 | self.client.write('foo/bar', 'hello, world!') 1033 | infos = list(self.client.walk('', depth=1)) 1034 | assert len(infos) == 1 1035 | assert infos[0] == (self.client.root, ['foo'], []) 1036 | 1037 | def test_folder_with_status(self): 1038 | self.client.write('foo', 'hello, world!') 1039 | infos = list(self.client.walk('', status=True)) 1040 | status = self.client.status('foo') 1041 | status['pathSuffix'] = 'foo' 1042 | assert len(infos) == 1 1043 | assert ( 1044 | infos[0] == 1045 | ( 1046 | (self.client.root, 
self.client.status('')), 1047 | [], 1048 | [('foo', status)] 1049 | )) 1050 | 1051 | def test_skip_missing_folder(self): 1052 | self.client.write('file', 'one') 1053 | self.client.write('folder/hey', 'two') 1054 | for info in self.client.walk('', ignore_missing=True): 1055 | assert info == (psp.join(self.client.root), ['folder'], ['file']) 1056 | self.client.delete('folder', recursive=True) 1057 | 1058 | def test_status_and_allow_dir_changes(self): 1059 | with pytest.raises(ValueError): 1060 | list(self.client.walk('.', status=True, allow_dir_changes=True)) 1061 | 1062 | def test_allow_dir_changes_subset(self): 1063 | self.client.write('foo/file1', 'one') 1064 | self.client.write('bar/file2', 'two') 1065 | infos = self.client.walk('.', allow_dir_changes=True) 1066 | info = next(infos) 1067 | info[1][:] = ['bar'] 1068 | info = next(infos) 1069 | assert info == (psp.join(self.client.root, 'bar'), [], ['file2']) 1070 | 1071 | def test_allow_dir_changes_insert(self): 1072 | self.client.write('foo/file1', 'one') 1073 | infos = self.client.walk('.', allow_dir_changes=True) 1074 | info = next(infos) 1075 | self.client.write('bar/file2', 'two') 1076 | info[1][:] = ['bar'] # Insert new directory. 1077 | info = next(infos) 1078 | assert info == (psp.join(self.client.root, 'bar'), [], ['file2']) 1079 | 1080 | 1081 | class TestLatestExpansion(_IntegrationTest): 1082 | 1083 | def test_resolve_simple(self): 1084 | self.client.write('bar', 'hello, world!') 1085 | self.client.write('foo', 'hello again, world!') 1086 | assert self.client.resolve('#LATEST') == osp.join(self.client.root, 'foo') 1087 | 1088 | def test_resolve_nested(self): 1089 | self.client.write('baz/bar', 'hello, world!') 1090 | self.client.write('bar/bar', 'hello there, world!') 1091 | self.client.write('bar/foo', 'hello again, world!') 1092 | latest = self.client.resolve('#LATEST/#LATEST') 1093 | assert latest == osp.join(self.client.root, 'bar', 'foo') 1094 | 1095 | def test_resolve_multiple(self): 1096 | self.client.write('bar/bar', 'hello, world!') 1097 | self.client.write('bar/foo', 'hello again, world!') 1098 | latest = self.client.resolve('#LATEST/#LATEST') 1099 | assert latest == osp.join(self.client.root, 'bar', 'foo') 1100 | 1101 | def test_resolve_multiple_shortcut(self): 1102 | self.client.write('bar/bar', 'hello, world!') 1103 | self.client.write('bar/foo', 'hello again, world!') 1104 | latest = self.client.resolve('#LATEST{2}') 1105 | assert latest == osp.join(self.client.root, 'bar', 'foo') 1106 | 1107 | @pytest.mark.skip(reason="HttpFS is inconsistent here.") 1108 | def test_resolve_file(self): 1109 | with pytest.raises(HdfsError): 1110 | self.client.write('bar', 'hello, world!') 1111 | self.client.resolve('bar/#LATEST') 1112 | 1113 | def test_resolve_empty_directory(self): 1114 | with pytest.raises(HdfsError): 1115 | self.client._mkdirs('bar') 1116 | self.client.resolve('bar/#LATEST') 1117 | 1118 | 1119 | class TestParts(_IntegrationTest): 1120 | 1121 | def test_missing(self): 1122 | with pytest.raises(HdfsError): 1123 | self.client.parts('foo') 1124 | 1125 | def test_file(self): 1126 | with pytest.raises(HdfsError): 1127 | self.client.write('foo', 'hello') 1128 | self.client.parts('foo') 1129 | 1130 | def test_empty_folder(self): 1131 | with pytest.raises(HdfsError): 1132 | self.client._mkdirs('foo') 1133 | self.client.parts('foo') 1134 | 1135 | def test_folder_without_parts(self): 1136 | with pytest.raises(HdfsError): 1137 | self.client.write('foo/bar', 'hello') 1138 | self.client.parts('foo') 1139 | 1140 | def 
test_folder_with_single_part(self): 1141 | fname = 'part-m-00000.avro' 1142 | self.client.write(psp.join('foo', fname), 'first') 1143 | assert self.client.parts('foo') == [fname] 1144 | 1145 | def test_folder_with_multiple_parts(self): 1146 | fnames = ['part-m-00000.avro', 'part-m-00001.avro'] 1147 | self.client.write(psp.join('foo', fnames[0]), 'first') 1148 | self.client.write(psp.join('foo', fnames[1]), 'second') 1149 | assert self.client.parts('foo') == fnames 1150 | 1151 | def test_folder_with_multiple_parts_and_others(self): 1152 | fnames = ['part-m-00000.avro', 'part-m-00001.avro'] 1153 | self.client.write(psp.join('foo', '.header'), 'metadata') 1154 | self.client.write(psp.join('foo', fnames[0]), 'first') 1155 | self.client.write(psp.join('foo', fnames[1]), 'second') 1156 | assert self.client.parts('foo') == fnames 1157 | 1158 | def test_with_selection(self): 1159 | fnames = ['part-m-00000.avro', 'part-m-00001.avro'] 1160 | self.client.write(psp.join('foo', '.header'), 'metadata') 1161 | self.client.write(psp.join('foo', fnames[0]), 'first') 1162 | self.client.write(psp.join('foo', fnames[1]), 'second') 1163 | parts = self.client.parts('foo', parts=1) 1164 | assert len(parts) == 1 1165 | assert parts[0] in fnames 1166 | 1167 | def test_with_selection(self): 1168 | fnames = ['part-m-00000.avro', 'part-m-00001.avro'] 1169 | self.client.write(psp.join('foo', '.header'), 'metadata') 1170 | self.client.write(psp.join('foo', fnames[0]), 'first') 1171 | self.client.write(psp.join('foo', fnames[1]), 'second') 1172 | assert self.client.parts('foo', parts=[1]) == fnames[1:] 1173 | 1174 | def test_with_status(self): 1175 | fname = 'part-m-00000.avro' 1176 | fpath = psp.join('foo', fname) 1177 | self.client.write(fpath, 'first') 1178 | status = self.client.status(fpath) 1179 | status['pathSuffix'] = fname 1180 | assert self.client.parts('foo', status=True) == [(fname, status)] 1181 | 1182 | 1183 | class TestMakeDirs(_IntegrationTest): 1184 | 1185 | def test_simple(self): 1186 | self.client.makedirs('foo') 1187 | assert self.client.status('foo')['type'] == 'DIRECTORY' 1188 | 1189 | def test_nested(self): 1190 | self.client.makedirs('foo/bar') 1191 | assert self.client.status('foo/bar')['type'] == 'DIRECTORY' 1192 | 1193 | def test_with_permission(self): 1194 | self.client.makedirs('foo', permission='733') 1195 | assert self.client.status('foo')['permission'] == '733' 1196 | 1197 | def test_overwrite_file(self): 1198 | with pytest.raises(HdfsError): 1199 | self.client.write('foo', 'hello') 1200 | self.client.makedirs('foo') 1201 | 1202 | def test_overwrite_directory_with_permission(self): 1203 | self.client.makedirs('foo', permission='733') 1204 | self.client.makedirs('foo/bar', permission='722') 1205 | assert self.client.status('foo')['permission'] == '733' 1206 | assert self.client.status('foo/bar')['permission'] == '722' 1207 | 1208 | 1209 | class TestSetTimes(_IntegrationTest): 1210 | 1211 | def test_none(self): 1212 | with pytest.raises(ValueError): 1213 | self.client.makedirs('foo') 1214 | self.client.set_times('foo') 1215 | 1216 | def test_missing(self): 1217 | with pytest.raises(HdfsError): 1218 | self.client.set_times('foo', 1234) 1219 | 1220 | @pytest.mark.skip() # HttpFS doesn't raise an error here. 
1221 | def test_negative(self): 1222 | with pytest.raises(HdfsError): 1223 | self.client.write('foo', 'hello') 1224 | self.client.set_times('foo', access_time=-1234) 1225 | 1226 | def test_file(self): 1227 | self.client.write('foo', 'hello') 1228 | self.client.set_times('foo', access_time=1234) 1229 | assert self.client.status('foo')['accessTime'] == 1234 1230 | self.client.set_times('foo', modification_time=12345) 1231 | assert self.client.status('foo')['modificationTime'] == 12345 1232 | self.client.set_times('foo', access_time=1, modification_time=2) 1233 | status = self.client.status('foo') 1234 | assert status['accessTime'] == 1 1235 | assert status['modificationTime'] == 2 1236 | 1237 | def test_folder(self): 1238 | self.client.write('foo/bar', 'hello') 1239 | self.client.set_times('foo', access_time=1234) 1240 | assert self.client.status('foo')['accessTime'] == 1234 1241 | self.client.set_times('foo', modification_time=12345) 1242 | assert self.client.status('foo')['modificationTime'] == 12345 1243 | self.client.set_times('foo', access_time=1, modification_time=2) 1244 | status = self.client.status('foo') 1245 | assert status['accessTime'] == 1 1246 | assert status['modificationTime'] == 2 1247 | 1248 | 1249 | class TestChecksum(_IntegrationTest): 1250 | 1251 | def test_missing(self): 1252 | with pytest.raises(HdfsError): 1253 | self.client.checksum('foo') 1254 | 1255 | def test_folder(self): 1256 | with pytest.raises(HdfsError): 1257 | self.client.makedirs('foo') 1258 | self.client.checksum('foo') 1259 | 1260 | def test_file(self): 1261 | self.client.write('foo', 'hello') 1262 | checksum = self.client.checksum('foo') 1263 | assert {'algorithm', 'bytes', 'length'} == set(checksum) 1264 | 1265 | 1266 | class TestSetReplication(_IntegrationTest): 1267 | 1268 | def test_missing(self): 1269 | with pytest.raises(HdfsError): 1270 | self.client.set_replication('foo', 1) 1271 | 1272 | def test_folder(self): 1273 | with pytest.raises(HdfsError): 1274 | self.client.makedirs('foo') 1275 | self.client.set_replication('foo', 1) 1276 | 1277 | def test_invalid_replication(self): 1278 | with pytest.raises(HdfsError): 1279 | self.client.write('foo', 'hello') 1280 | self.client.set_replication('foo', 0) 1281 | 1282 | def test_file(self): 1283 | self.client.write('foo', 'hello') 1284 | replication = self.client.status('foo')['replication'] + 1 1285 | self.client.set_replication('foo', replication) 1286 | assert self.client.status('foo')['replication'] == replication 1287 | 1288 | 1289 | class TestTokenClient(object): 1290 | 1291 | def test_without_session(self): 1292 | client = TokenClient('url', '123') 1293 | assert client._session.params['delegation'] == '123' 1294 | 1295 | def test_with_session(self): 1296 | session = rq.Session() 1297 | client = TokenClient('url', '123', session=session) 1298 | assert session.params['delegation'] == '123' 1299 | 1300 | 1301 | class TestSnapshot(_IntegrationTest): 1302 | 1303 | @classmethod 1304 | def setup_class(cls): 1305 | super(TestSnapshot, cls).setup_class() 1306 | if cls.client: 1307 | cls.client._mkdirs('foo') 1308 | try: 1309 | cls.client.allow_snapshot('foo') 1310 | except HdfsError as err: 1311 | if 'java.lang.IllegalArgumentException: No enum constant' in str(err): 1312 | cls.client = None 1313 | # Skip these tests if we get this error message from HDFS (currently 1314 | # happens using HTTPFS) which causes all snapshot operations to fail. 
1315 | else: 1316 | raise err 1317 | 1318 | def test_allow_snapshot(self): 1319 | self.client._mkdirs('foo') 1320 | self.client.allow_snapshot('foo') 1321 | 1322 | def test_allow_snapshot_double(self): 1323 | self.client._mkdirs('foo') 1324 | self.client.allow_snapshot('foo') 1325 | self.client.allow_snapshot('foo') 1326 | 1327 | def test_disallow_snapshot(self): 1328 | self.client._mkdirs('foo') 1329 | self.client.allow_snapshot('foo') 1330 | self.client.disallow_snapshot('foo') 1331 | 1332 | def test_disallow_no_allow(self): 1333 | self.client._mkdirs('foo') 1334 | self.client.disallow_snapshot('foo') 1335 | 1336 | def test_allow_snapshot_not_exists(self): 1337 | with pytest.raises(HdfsError): 1338 | self.client.allow_snapshot('foo') 1339 | 1340 | def test_disallow_snapshot_not_exists(self): 1341 | with pytest.raises(HdfsError): 1342 | self.client.disallow_snapshot('foo') 1343 | 1344 | def test_allow_snapshot_file(self): 1345 | with pytest.raises(HdfsError): 1346 | self._write('foo', b'hello') 1347 | self.client.allow_snapshot('foo') 1348 | 1349 | def test_disallow_snapshot_file(self): 1350 | with pytest.raises(HdfsError): 1351 | self._write('foo', b'hello') 1352 | self.client.disallow_snapshot('foo') 1353 | 1354 | def test_create_delete_snapshot(self): 1355 | # One cannot test creation and deletion separately, as one cannot 1356 | # clean HDFS for test isolation if a created snapshot remains 1357 | # undeleted. 1358 | self.client._mkdirs('foo') 1359 | self.client.allow_snapshot('foo') 1360 | self.client.create_snapshot('foo', 'mysnap') 1361 | self.client.delete_snapshot('foo', 'mysnap') 1362 | 1363 | def test_create_snapshot_name(self): 1364 | self.client._mkdirs('foo') 1365 | self.client.allow_snapshot('foo') 1366 | try: 1367 | snapshot_path = self.client.create_snapshot('foo', 'mysnap') 1368 | assert re.search(r'/foo/\.snapshot/mysnap$',snapshot_path) 1369 | finally: 1370 | # Cleanup, as it breaks other tests otherwise: the dir cannot be 1371 | # removed with an active snapshots. 1372 | self.client.delete_snapshot('foo', 'mysnap') 1373 | 1374 | def test_delete_snapshot_other(self): 1375 | with pytest.raises(HdfsError): 1376 | self.client._mkdirs('foo') 1377 | self.client.allow_snapshot('foo') 1378 | self.client.create_snapshot('foo', 'mysnap') 1379 | try: 1380 | self.client.delete_snapshot('foo', 'othersnap') 1381 | finally: 1382 | # Cleanup, as it breaks other tests otherwise: the dir cannot be 1383 | # removed with an active snapshots. 1384 | self.client.delete_snapshot('foo', 'mysnap') 1385 | 1386 | def test_disallow_snapshot_exists(self): 1387 | with pytest.raises(HdfsError): 1388 | self.client._mkdirs('foo_disallow') 1389 | self.client.allow_snapshot('foo_disallow') 1390 | self.client.create_snapshot('foo_disallow', 'mysnap') 1391 | try: 1392 | self.client.disallow_snapshot('foo_disallow') 1393 | finally: 1394 | # Cleanup, as it breaks other tests otherwise: the dir cannot be 1395 | # removed with an active snapshots. 
1396 | self.client.delete_snapshot('foo_disallow', 'mysnap') 1397 | 1398 | def test_create_snapshot_noallow(self): 1399 | with pytest.raises(HdfsError): 1400 | self.client._mkdirs('foo') 1401 | self.client.create_snapshot('foo', 'mysnap') 1402 | 1403 | def test_delete_snapshot_noallow(self): 1404 | with pytest.raises(HdfsError): 1405 | self.client._mkdirs('foo') 1406 | self.client.delete_snapshot('foo', 'mysnap') 1407 | 1408 | def test_create_snapshot_noexist(self): 1409 | with pytest.raises(HdfsError): 1410 | self.client.create_snapshot('foo', 'mysnap') 1411 | 1412 | def test_rename_snapshot(self): 1413 | self.client._mkdirs('foo') 1414 | self.client.allow_snapshot('foo') 1415 | self.client.create_snapshot('foo', 'myspan') 1416 | try: 1417 | self.client.rename_snapshot('foo', 'myspan', 'yourspan') 1418 | finally: 1419 | self.client.delete_snapshot('foo', 'yourspan') 1420 | 1421 | def test_rename_snapshot_not_exists(self): 1422 | with pytest.raises(HdfsError): 1423 | self.client.rename_snapshot('foo', 'myspan', 'yourspan') 1424 | 1425 | def test_rename_snapshot_not_overwrite(self): 1426 | with pytest.raises(HdfsError): 1427 | self.client._mkdirs('foo') 1428 | self.client.allow_snapshot('foo') 1429 | self.client.create_snapshot('foo', 'myspan') 1430 | self.client.create_snapshot('foo', 'yourspan') 1431 | try: 1432 | self.client.rename_snapshot('foo', 'myspan', 'yourspan') 1433 | finally: 1434 | self.client.delete_snapshot('foo', 'myspan') 1435 | self.client.delete_snapshot('foo', 'yourspan') 1436 | -------------------------------------------------------------------------------- /test/test_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test configuration module.""" 5 | 6 | from hdfs.client import Client 7 | from hdfs.config import Config 8 | from hdfs.util import HdfsError, temppath 9 | from logging.handlers import TimedRotatingFileHandler 10 | from string import Template 11 | from test.util import save_config 12 | import logging as lg 13 | import os 14 | import os.path as osp 15 | import pytest 16 | import sys 17 | 18 | 19 | class TestConfig(object): 20 | 21 | @pytest.mark.skip(reason="TODO: Find cross-platform way to reset the environment variable.") 22 | def test_config_path(self): 23 | path = os.getenv('HDFSCLI_CONFIG') 24 | try: 25 | with temppath() as tpath: 26 | os.environ['HDFSCLI_CONFIG'] = tpath 27 | with open(tpath, 'w') as writer: 28 | writer.write('[foo]\nbar=hello') 29 | assert Config().get('foo', 'bar') == 'hello' 30 | finally: 31 | if path: 32 | os['HDFSCLI_CONFIG'] = path 33 | else: 34 | del os['HDFSCLI_CONFIG'] 35 | 36 | def _write_client_module(self, path, class_name): 37 | template = osp.join(osp.dirname(__file__), 'dat', 'client_template.py') 38 | with open(template) as reader: 39 | contents = Template(reader.read()).substitute({ 40 | 'class_name': class_name, 41 | }) 42 | with open(path, 'w') as writer: 43 | writer.write(contents) 44 | 45 | def test_autoload_client_from_path(self): 46 | with temppath() as module_path: 47 | self._write_client_module(module_path, 'PathClient') 48 | with temppath() as config_path: 49 | config = Config(config_path) 50 | config.add_section(config.global_section) 51 | config.set(config.global_section, 'autoload.paths', module_path) 52 | config._autoload() 53 | client = Client.from_options({'url': ''}, 'PathClient') 54 | assert client.one == 1 55 | 56 | def test_autoload_missing_path(self): 57 | with pytest.raises(SystemExit): 58 | with 
temppath() as module_path: 59 | with temppath() as config_path: 60 | config = Config(config_path) 61 | config.add_section(config.global_section) 62 | config.set(config.global_section, 'autoload.paths', module_path) 63 | config._autoload() 64 | 65 | def test_autoload_client_from_module(self): 66 | with temppath() as module_dpath: 67 | os.mkdir(module_dpath) 68 | sys.path.append(module_dpath) 69 | module_fpath = osp.join(module_dpath, 'mclient.py') 70 | self._write_client_module(module_fpath, 'ModuleClient') 71 | try: 72 | with temppath() as config_path: 73 | config = Config(config_path) 74 | config.add_section(config.global_section) 75 | config.set(config.global_section, 'autoload.modules', 'mclient') 76 | config._autoload() 77 | client = Client.from_options({'url': ''}, 'ModuleClient') 78 | assert client.one == 1 79 | finally: 80 | sys.path.remove(module_dpath) 81 | 82 | def test_create_client_with_alias(self): 83 | with temppath() as tpath: 84 | config = Config(path=tpath) 85 | section = 'dev.alias' 86 | config.add_section(section) 87 | config.set(section, 'url', 'http://host:port') 88 | save_config(config) 89 | Config(path=tpath).get_client('dev') 90 | 91 | def test_create_client_with_alias_and_timeout(self): 92 | with temppath() as tpath: 93 | config = Config(path=tpath) 94 | section = 'dev.alias' 95 | config.add_section(section) 96 | config.set(section, 'url', 'http://host:port') 97 | config.set(section, 'timeout', '1') 98 | save_config(config) 99 | assert Config(path=tpath).get_client('dev')._timeout == 1 100 | config.set(section, 'timeout', '1,2') 101 | save_config(config) 102 | assert Config(path=tpath).get_client('dev')._timeout == (1,2) 103 | 104 | def test_create_client_with_missing_alias(self): 105 | with pytest.raises(HdfsError): 106 | with temppath() as tpath: 107 | Config(tpath).get_client('dev') 108 | 109 | def test_create_client_with_no_alias_without_default(self): 110 | with pytest.raises(HdfsError): 111 | with temppath() as tpath: 112 | Config(tpath).get_client() 113 | 114 | def test_create_client_with_default_alias(self): 115 | with temppath() as tpath: 116 | config = Config(tpath) 117 | config.add_section(config.global_section) 118 | config.set(config.global_section, 'default.alias', 'dev') 119 | section = 'dev.alias' 120 | config.add_section(section) 121 | config.set(section, 'url', 'http://host:port') 122 | save_config(config) 123 | Config(tpath).get_client() 124 | 125 | def test_get_file_handler(self): 126 | with temppath() as tpath: 127 | config = Config(tpath) 128 | handler = config.get_log_handler('cmd') 129 | assert isinstance(handler, TimedRotatingFileHandler) 130 | 131 | def test_disable_file_logging(self): 132 | with temppath() as tpath: 133 | config = Config(tpath) 134 | config.add_section('cmd.command') 135 | config.set('cmd.command', 'log.disable', 'true') 136 | save_config(config) 137 | config = Config(tpath) 138 | handler = config.get_log_handler('cmd') 139 | assert not isinstance(handler, TimedRotatingFileHandler) 140 | -------------------------------------------------------------------------------- /test/test_examples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test that the examples run correctly.""" 5 | 6 | from hdfs import Config 7 | from hdfs.config import _load_source 8 | from six import add_metaclass 9 | from test.util import _IntegrationTest 10 | import os 11 | import os.path as osp 12 | import pytest 13 | 14 | class _ExamplesType(type): 15 | 16 | 
"""Metaclass generating a test for each example.""" 17 | 18 | dpath = osp.join(osp.dirname(__file__), os.pardir, 'examples') 19 | 20 | def __new__(mcs, cls, bases, attrs): 21 | 22 | def make_test(fname): 23 | fpath = osp.join(mcs.dpath, fname) 24 | module = osp.splitext(fname)[0] 25 | 26 | def test(self): 27 | try: 28 | _load_source(module, fpath) 29 | except ImportError: 30 | # Unmet dependency. 31 | pytest.skip() 32 | 33 | test.__name__ = 'test_{}'.format(module) 34 | test.__doc__ = 'Test for example {}.'.format(fpath) 35 | return test 36 | 37 | for fname in os.listdir(mcs.dpath): 38 | if osp.splitext(fname)[1] == '.py': 39 | test = make_test(fname) 40 | attrs[test.__name__] = test 41 | return super(_ExamplesType, mcs).__new__(mcs, cls, bases, attrs) 42 | 43 | 44 | @add_metaclass(_ExamplesType) 45 | class TestExamples(_IntegrationTest): 46 | 47 | """Empty since tests are injected by the metaclass.""" 48 | 49 | _get_client = None 50 | 51 | @classmethod 52 | def setup_class(cls): 53 | super(TestExamples, cls).setup_class() 54 | cls._get_client = Config.get_client 55 | Config.get_client = staticmethod(lambda: cls.client) 56 | 57 | @classmethod 58 | def teardown_class(cls): 59 | Config.get_client = cls._get_client 60 | super(TestExamples, cls).teardown_class() 61 | -------------------------------------------------------------------------------- /test/test_ext_avro.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test Avro extension.""" 5 | 6 | from hdfs.util import HdfsError, temppath 7 | from json import dumps, load, loads 8 | from test.util import _IntegrationTest 9 | import os 10 | import os.path as osp 11 | import pytest 12 | 13 | try: 14 | from hdfs.ext.avro import (_SeekableReader, _SchemaInferrer, AvroReader, 15 | AvroWriter) 16 | from hdfs.ext.avro.__main__ import main 17 | except ImportError: 18 | SKIP = True 19 | else: 20 | SKIP = False 21 | 22 | 23 | class TestSeekableReader(object): 24 | 25 | def setup_method(self): 26 | if SKIP: 27 | pytest.skip() 28 | 29 | def test_normal_read(self): 30 | with temppath() as tpath: 31 | with open(tpath, 'w') as writer: 32 | writer.write('abcd') 33 | with open(tpath) as reader: 34 | sreader = _SeekableReader(reader) 35 | assert sreader.read(3) == 'abc' 36 | assert sreader.read(2) == 'd' 37 | assert not sreader.read(1) 38 | 39 | def test_buffered_read(self): 40 | with temppath() as tpath: 41 | with open(tpath, 'w') as writer: 42 | writer.write('abcdefghi') 43 | with open(tpath) as reader: 44 | sreader = _SeekableReader(reader, 3) 45 | assert sreader.read(1) == 'a' 46 | assert sreader.read(3) == 'bcd' 47 | sreader.seek(-3, os.SEEK_CUR) 48 | assert sreader.read(2) == 'bc' 49 | assert sreader.read(6) == 'defghi' 50 | assert not sreader.read(1) 51 | 52 | 53 | class TestInferSchema(object): 54 | 55 | def setup_method(self): 56 | if SKIP: 57 | pytest.skip() 58 | 59 | 60 | def test_array(self): 61 | assert ( 62 | _SchemaInferrer().infer({'foo': 1, 'bar': ['hello']}) == 63 | { 64 | 'type': 'record', 65 | 'name': '__Record1', 66 | 'fields': [ 67 | {'type': {'type': 'array', 'items': 'string'}, 'name': 'bar'}, 68 | {'type': 'int', 'name': 'foo'}, 69 | ] 70 | }) 71 | 72 | def test_flat_record(self): 73 | assert ( 74 | _SchemaInferrer().infer({'foo': 1, 'bar': 'hello'}) == 75 | { 76 | 'type': 'record', 77 | 'name': '__Record1', 78 | 'fields': [ 79 | {'type': 'string', 'name': 'bar'}, 80 | {'type': 'int', 'name': 'foo'}, 81 | ] 82 | }) 83 | 84 | def 
test_nested_record(self): 85 | assert ( 86 | _SchemaInferrer().infer({'foo': {'bax': 2}, 'bar': {'baz': 3}}) == 87 | { 88 | 'type': 'record', 89 | 'name': '__Record1', 90 | 'fields': [ 91 | { 92 | 'type': { 93 | 'type': 'record', 94 | 'name': '__Record2', 95 | 'fields': [{'type': 'int', 'name': 'baz'}] 96 | }, 97 | 'name': 'bar', 98 | }, 99 | { 100 | 'type': { 101 | 'type': 'record', 102 | 'name': '__Record3', 103 | 'fields': [{'type': 'int', 'name': 'bax'}] 104 | }, 105 | 'name': 'foo', 106 | }, 107 | ] 108 | }) 109 | 110 | 111 | class _AvroIntegrationTest(_IntegrationTest): 112 | 113 | dpath = osp.join(osp.dirname(__file__), 'dat') 114 | schema = None 115 | records = None 116 | 117 | @classmethod 118 | def setup_class(cls): 119 | if SKIP: 120 | return 121 | super(_AvroIntegrationTest, cls).setup_class() 122 | with open(osp.join(cls.dpath, 'weather.avsc')) as reader: 123 | cls.schema = loads(reader.read()) 124 | with open(osp.join(cls.dpath, 'weather.jsonl')) as reader: 125 | cls.records = [loads(line) for line in reader] 126 | 127 | @classmethod 128 | def _get_data_bytes(cls, fpath): 129 | # Get Avro bytes, skipping header (order of schema fields is undefined) and 130 | # sync marker. This assumes that the file can be written in a single block. 131 | with open(fpath, 'rb') as reader: 132 | reader.seek(-16, os.SEEK_END) # Sync marker always last 16 bytes. 133 | sync_marker = reader.read() 134 | reader.seek(0) 135 | content = reader.read() 136 | sync_pos = content.find(sync_marker) 137 | return content[sync_pos + 16:-16] 138 | 139 | 140 | class TestRead(_AvroIntegrationTest): 141 | 142 | def test_read(self): 143 | self.client.upload('weather.avro', osp.join(self.dpath, 'weather.avro')) 144 | with AvroReader(self.client, 'weather.avro') as reader: 145 | assert list(reader) == self.records 146 | 147 | def test_read_with_same_schema(self): 148 | self.client.upload('w.avro', osp.join(self.dpath, 'weather.avro')) 149 | with AvroReader(self.client, 'w.avro', reader_schema=self.schema) as reader: 150 | assert list(reader) == self.records 151 | 152 | def test_read_with_compatible_schema(self): 153 | self.client.upload('w.avro', osp.join(self.dpath, 'weather.avro')) 154 | schema = { 155 | 'name': 'test.Weather', 156 | 'type': 'record', 157 | 'fields': [ 158 | {'name': 'temp', 'type': 'int'}, 159 | {'name': 'tag', 'type': 'string', 'default': ''}, 160 | ], 161 | } 162 | with AvroReader(self.client, 'w.avro', reader_schema=schema) as reader: 163 | assert ( 164 | list(reader) == 165 | [{'temp': r['temp'], 'tag': ''} for r in self.records]) 166 | 167 | 168 | class TestWriter(_AvroIntegrationTest): 169 | 170 | def test_write(self): 171 | writer = AvroWriter( 172 | self.client, 173 | 'weather.avro', 174 | schema=self.schema, 175 | ) 176 | with writer: 177 | for record in self.records: 178 | writer.write(record) 179 | with temppath() as tpath: 180 | self.client.download('weather.avro', tpath) 181 | assert ( 182 | self._get_data_bytes(osp.join(self.dpath, 'weather.avro')) == 183 | self._get_data_bytes(tpath)) 184 | 185 | def test_write_in_multiple_blocks(self): 186 | writer = AvroWriter( 187 | self.client, 188 | 'weather.avro', 189 | schema=self.schema, 190 | sync_interval=1 # Flush block on every write. 
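#   (Because every write flushes the current block, the resulting file spans
#   several Avro blocks instead of the single block the defaults would likely
#   produce; that multi-block layout is exactly what this round trip is meant
#   to exercise.)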
191 | ) 192 | with writer: 193 | for record in self.records: 194 | writer.write(record) 195 | with AvroReader(self.client, 'weather.avro') as reader: 196 | assert list(reader) == self.records 197 | 198 | def test_write_empty(self): 199 | with AvroWriter(self.client, 'empty.avro', schema=self.schema): 200 | pass 201 | with AvroReader(self.client, 'empty.avro') as reader: 202 | assert reader.schema == self.schema 203 | assert list(reader) == [] 204 | 205 | def test_write_overwrite_error(self): 206 | with pytest.raises(HdfsError): 207 | # To check that the background `AsyncWriter` thread doesn't hang. 208 | self.client.makedirs('weather.avro') 209 | with AvroWriter(self.client, 'weather.avro', schema=self.schema) as writer: 210 | for record in self.records: 211 | writer.write(record) 212 | 213 | def test_infer_schema(self): 214 | with AvroWriter(self.client, 'weather.avro') as writer: 215 | for record in self.records: 216 | writer.write(record) 217 | with AvroReader(self.client, 'weather.avro') as reader: 218 | assert list(reader) == self.records 219 | 220 | 221 | class TestMain(_AvroIntegrationTest): 222 | 223 | def test_schema(self): 224 | self.client.upload('weather.avro', osp.join(self.dpath, 'weather.avro')) 225 | with temppath() as tpath: 226 | with open(tpath, 'w') as writer: 227 | main(['schema', 'weather.avro'], client=self.client, stdout=writer) 228 | with open(tpath) as reader: 229 | schema = load(reader) 230 | assert self.schema == schema 231 | 232 | def test_read(self): 233 | self.client.upload('weather.avro', osp.join(self.dpath, 'weather.avro')) 234 | with temppath() as tpath: 235 | with open(tpath, 'w') as writer: 236 | main( 237 | ['read', 'weather.avro', '--num', '2'], 238 | client=self.client, 239 | stdout=writer 240 | ) 241 | with open(tpath) as reader: 242 | records = [loads(line) for line in reader] 243 | assert records == self.records[:2] 244 | 245 | def test_read_part_file(self): 246 | data = { 247 | 'part-m-00000.avro': [{'name': 'jane'}, {'name': 'bob'}], 248 | 'part-m-00001.avro': [{'name': 'john'}, {'name': 'liz'}], 249 | } 250 | for fname, records in data.items(): 251 | with AvroWriter(self.client, 'data.avro/{}'.format(fname)) as writer: 252 | for record in records: 253 | writer.write(record) 254 | with temppath() as tpath: 255 | with open(tpath, 'w') as writer: 256 | main( 257 | ['read', 'data.avro', '--parts', '1,'], 258 | client=self.client, 259 | stdout=writer 260 | ) 261 | with open(tpath) as reader: 262 | records = [loads(line) for line in reader] 263 | assert records == data['part-m-00001.avro'] 264 | 265 | def test_write(self): 266 | with open(osp.join(self.dpath, 'weather.jsonl')) as reader: 267 | main( 268 | [ 269 | 'write', 'weather.avro', 270 | '--schema', dumps(self.schema), 271 | '--codec', 'null', 272 | ], 273 | client=self.client, 274 | stdin=reader 275 | ) 276 | with temppath() as tpath: 277 | self.client.download('weather.avro', tpath) 278 | assert ( 279 | self._get_data_bytes(tpath) == 280 | self._get_data_bytes(osp.join(self.dpath, 'weather.avro'))) 281 | 282 | def test_write_codec(self): 283 | with open(osp.join(self.dpath, 'weather.jsonl')) as reader: 284 | main( 285 | [ 286 | 'write', 'weather.avro', 287 | '--schema', dumps(self.schema), 288 | '--codec', 'deflate', 289 | ], 290 | client=self.client, 291 | stdin=reader 292 | ) 293 | # Correct content. 
294 | with AvroReader(self.client, 'weather.avro') as reader: 295 | records = list(reader) 296 | assert records == self.records 297 | # Different size (might not be smaller, since very small file). 298 | compressed_size = self.client.content('weather.avro')['length'] 299 | uncompressed_size = osp.getsize(osp.join(self.dpath, 'weather.avro')) 300 | assert compressed_size != uncompressed_size 301 | -------------------------------------------------------------------------------- /test/test_ext_dataframe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test Dataframe extension.""" 5 | 6 | from hdfs.util import HdfsError, temppath 7 | from json import loads 8 | from test.util import _IntegrationTest 9 | import os.path as osp 10 | 11 | try: 12 | from hdfs.ext.avro import AvroReader 13 | from hdfs.ext.dataframe import read_dataframe, write_dataframe 14 | from pandas.testing import assert_frame_equal 15 | import pandas as pd 16 | except ImportError: 17 | SKIP = True 18 | else: 19 | SKIP = False 20 | 21 | 22 | class _DataFrameIntegrationTest(_IntegrationTest): 23 | 24 | dpath = osp.join(osp.dirname(__file__), 'dat') 25 | records = None 26 | df = None 27 | 28 | @classmethod 29 | def setup_class(cls): 30 | if SKIP: 31 | return 32 | super(_DataFrameIntegrationTest, cls).setup_class() 33 | with open(osp.join(cls.dpath, 'weather.jsonl')) as reader: 34 | cls.records = [loads(line) for line in reader] 35 | cls.df = pd.DataFrame.from_records(cls.records) 36 | 37 | 38 | class TestReadDataFrame(_DataFrameIntegrationTest): 39 | 40 | def test_read(self): 41 | self.client.upload('weather.avro', osp.join(self.dpath, 'weather.avro')) 42 | assert_frame_equal( 43 | read_dataframe(self.client, 'weather.avro'), 44 | self.df 45 | ) 46 | 47 | 48 | class TestWriteDataFrame(_DataFrameIntegrationTest): 49 | 50 | def test_write(self): 51 | write_dataframe(self.client, 'weather.avro', self.df) 52 | with AvroReader(self.client, 'weather.avro') as reader: 53 | assert list(reader) == self.records 54 | 55 | 56 | class TestReadWriteDataFrame(_DataFrameIntegrationTest): 57 | 58 | def test_column_order(self): 59 | # Column order should be preserved, not just alphabetical. 
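# Concretely: the frame below reorders the columns to ['temp', 'station',
# 'time'], and the assertion checks that the write_dataframe / read_dataframe
# round trip hands back that same ordering rather than an alphabetised one.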
60 | df = self.df[['temp', 'station', 'time']] 61 | write_dataframe(self.client, 'weather-ordered.avro', df) 62 | assert_frame_equal( 63 | read_dataframe(self.client, 'weather-ordered.avro'), 64 | df 65 | ) 66 | -------------------------------------------------------------------------------- /test/test_ext_kerberos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test Kerberos extension.""" 5 | 6 | from threading import Lock, Thread 7 | from time import sleep, time 8 | import sys 9 | 10 | 11 | class MockHTTPKerberosAuth(object): 12 | 13 | def __init__(self, **kwargs): 14 | self._lock = Lock() 15 | self._calls = set() 16 | self._items = [] 17 | 18 | def __call__(self, n): 19 | with self._lock: 20 | assert not self._items 21 | self._items.append(n) 22 | sleep(0.25) 23 | with self._lock: 24 | thread = self._items.pop() 25 | assert thread == n 26 | self._calls.add(thread) 27 | 28 | 29 | class MockModule(object): 30 | def __init__(self): 31 | self.HTTPKerberosAuth = MockHTTPKerberosAuth 32 | 33 | 34 | sys.modules['requests_kerberos'] = MockModule() 35 | 36 | from hdfs.ext.kerberos import _HdfsHTTPKerberosAuth 37 | 38 | 39 | class TestKerberosClient(object): 40 | 41 | def test_max_concurrency(self): 42 | auth = _HdfsHTTPKerberosAuth(1, mutual_auth='OPTIONAL') 43 | t1 = Thread(target=auth.__call__, args=(1, )) 44 | t1.start() 45 | t2 = Thread(target=auth.__call__, args=(2, )) 46 | t2.start() 47 | t1.join() 48 | t2.join() 49 | assert auth._calls == {1, 2} 50 | -------------------------------------------------------------------------------- /test/test_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test CLI.""" 5 | 6 | from hdfs.__main__ import _Progress, configure_client, main, parse_arg 7 | from hdfs.config import Config, NullHandler 8 | from hdfs.util import HdfsError, temppath 9 | from logging.handlers import TimedRotatingFileHandler 10 | from test.util import _IntegrationTest 11 | import filecmp 12 | import logging as lg 13 | import os 14 | import os.path as osp 15 | import pytest 16 | import sys 17 | 18 | 19 | class TestParseArg(object): 20 | 21 | def test_parse_invalid(self): 22 | with pytest.raises(HdfsError): 23 | parse_arg({'foo': 'a'}, 'foo', int) 24 | 25 | def test_parse_int(self): 26 | assert parse_arg({'foo': '1'}, 'foo', int) == 1 27 | assert parse_arg({'foo': '1'}, 'foo', int, ',') == 1 28 | 29 | def test_parse_float(self): 30 | assert parse_arg({'foo': '123.4'}, 'foo', float) == 123.4 31 | 32 | def test_parse_int_list(self): 33 | assert parse_arg({'foo': '1,'}, 'foo', int, ',') == [1] 34 | assert parse_arg({'foo': '1,2'}, 'foo', int, ',') == [1,2] 35 | 36 | 37 | class TestConfigureClient(object): 38 | 39 | def test_with_alias(self): 40 | url = 'http://host:port' 41 | with temppath() as tpath: 42 | config = Config(path=tpath) 43 | section = 'dev.alias' 44 | config.add_section(section) 45 | config.set(section, 'url', url) 46 | args = {'--alias': 'dev', '--log': False, '--verbose': 0} 47 | client = configure_client('test', args, config=config) 48 | assert client.url == url 49 | assert client.urls == [url] 50 | 51 | 52 | class TestProgress(object): 53 | 54 | def test_single_file(self): 55 | with temppath() as tpath: 56 | with open(tpath, 'w') as writer: 57 | progress = _Progress(100, 1, writer=writer) 58 | progress('foo', 60) 59 | assert progress._data['foo'] == 60 60 | assert 
progress._pending_files == 0 61 | assert progress._downloading_files == 1 62 | progress('foo', 40) 63 | progress('foo', -1) 64 | assert progress._downloading_files == 0 65 | assert progress._complete_files == 1 66 | 67 | def test_from_local_path(self): 68 | with temppath() as dpath: 69 | os.mkdir(dpath) 70 | fpath1 = osp.join(dpath, 'foo') 71 | with open(fpath1, 'w') as writer: 72 | writer.write('hey') 73 | os.mkdir(osp.join(dpath, 'bar')) 74 | fpath2 = osp.join(dpath, 'bar', 'baz') 75 | with open(fpath2, 'w') as writer: 76 | writer.write('hello') 77 | with temppath() as tpath: 78 | with open(tpath, 'w') as writer: 79 | progress = _Progress.from_local_path(dpath, writer=writer) 80 | assert progress._total_bytes == 8 81 | assert progress._pending_files == 2 82 | 83 | 84 | class TestMain(_IntegrationTest): 85 | 86 | dpath = osp.join(osp.dirname(__file__), 'dat') 87 | 88 | def setup_method(self): 89 | self._root_logger = lg.getLogger() 90 | self._handlers = self._root_logger.handlers 91 | super(TestMain, self).setup_method() 92 | 93 | def teardown_method(self): 94 | self._root_logger.handlers = self._handlers 95 | 96 | def _dircmp(self, dpath): 97 | dircmp = filecmp.dircmp(self.dpath, dpath) 98 | assert not dircmp.left_only 99 | assert not dircmp.right_only 100 | assert not dircmp.diff_files 101 | 102 | def test_download(self): 103 | self.client.upload('foo', self.dpath) 104 | with temppath() as tpath: 105 | main( 106 | ['download', 'foo', tpath, '--silent', '--threads', '1'], 107 | self.client 108 | ) 109 | self._dircmp(tpath) 110 | 111 | def test_download_stream(self): 112 | self.client.write('foo', 'hello') 113 | with temppath() as tpath: 114 | stdout = sys.stdout 115 | try: 116 | with open(tpath, 'wb') as writer: 117 | sys.stdout = writer 118 | main( 119 | ['download', 'foo', '-', '--silent', '--threads', '1'], 120 | self.client 121 | ) 122 | finally: 123 | sys.stdout = stdout 124 | with open(tpath) as reader: 125 | assert reader.read() == 'hello' 126 | 127 | def test_download_stream_multiple_files(self): 128 | with pytest.raises(SystemExit): 129 | self.client.upload('foo', self.dpath) 130 | main( 131 | ['download', 'foo', '-', '--silent', '--threads', '1'], 132 | self.client 133 | ) 134 | 135 | def test_download_overwrite(self): 136 | with pytest.raises(SystemExit): 137 | self.client.upload('foo', self.dpath) 138 | with temppath() as tpath: 139 | with open(tpath, 'w'): 140 | pass 141 | main( 142 | ['download', 'foo', tpath, '--silent', '--threads', '1'], 143 | self.client 144 | ) 145 | self._dircmp(tpath) 146 | 147 | def test_download_force(self): 148 | self.client.write('foo', 'hey') 149 | with temppath() as tpath: 150 | with open(tpath, 'w'): 151 | pass 152 | main( 153 | ['download', 'foo', tpath, '--silent', '--force', '--threads', '1'], 154 | self.client 155 | ) 156 | with open(tpath) as reader: 157 | assert reader.read() == 'hey' 158 | 159 | def test_upload(self): 160 | main( 161 | ['upload', self.dpath, 'bar', '--silent', '--threads', '1'], 162 | self.client 163 | ) 164 | with temppath() as tpath: 165 | self.client.download('bar', tpath) 166 | self._dircmp(tpath) 167 | 168 | def test_upload_overwrite(self): 169 | with pytest.raises(SystemExit): 170 | self.client.write('bar', 'hey') 171 | main( 172 | ['upload', self.dpath, 'bar', '--silent', '--threads', '1'], 173 | self.client 174 | ) 175 | 176 | def test_upload_force(self): 177 | self.client.write('bar', 'hey') 178 | main( 179 | ['upload', self.dpath, 'bar', '--silent', '--threads', '1', '--force'], 180 | self.client 181 | ) 182 | 
183 |       self.client.download('bar', tpath)
184 |       self._dircmp(tpath)
185 | 
186 |   def test_upload_append(self):
187 |     with temppath() as tpath:
188 |       with open(tpath, 'w') as writer:
189 |         writer.write('hey')
190 |       main(['upload', tpath, 'bar', '--silent', '--threads', '1'], self.client)
191 |       main(
192 |         ['upload', tpath, 'bar', '--silent', '--threads', '1', '--append'],
193 |         self.client
194 |       )
195 |       with self.client.read('bar') as reader:
196 |         assert reader.read() == b'heyhey'
197 | 
198 |   def test_upload_append_folder(self):
199 |     with pytest.raises(SystemExit):
200 |       with temppath() as tpath:
201 |         main(['upload', self.dpath, '--silent', '--append'], self.client)
202 | 
--------------------------------------------------------------------------------
/test/test_util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | """Test Hdfs utilities."""
5 | 
6 | from hdfs.util import *
7 | import pytest
8 | 
9 | 
10 | class TestAsyncWriter(object):
11 | 
12 |   def test_basic(self):
13 |     result = []
14 |     def consumer(gen):
15 |       result.append(list(gen))
16 |     with AsyncWriter(consumer) as writer:
17 |       writer.write('one')
18 |       writer.write('two')
19 |     assert result == [['one','two']]
20 | 
21 |   def test_multiple_writer_uses(self):
22 |     result = []
23 |     def consumer(gen):
24 |       result.append(list(gen))
25 |     writer = AsyncWriter(consumer)
26 |     with writer:
27 |       writer.write('one')
28 |       writer.write('two')
29 |     with writer:
30 |       writer.write('three')
31 |       writer.write('four')
32 |     assert result == [['one','two'],['three','four']]
33 | 
34 |   def test_multiple_consumer_uses(self):
35 |     result = []
36 |     def consumer(gen):
37 |       result.append(list(gen))
38 |     with AsyncWriter(consumer) as writer:
39 |       writer.write('one')
40 |       writer.write('two')
41 |     with AsyncWriter(consumer) as writer:
42 |       writer.write('three')
43 |       writer.write('four')
44 |     assert result == [['one','two'],['three','four']]
45 | 
46 |   def test_nested(self):
47 |     with pytest.raises(ValueError):
48 |       result = []
49 |       def consumer(gen):
50 |         result.append(list(gen))
51 |       with AsyncWriter(consumer) as _writer:
52 |         _writer.write('one')
53 |         with _writer as writer:
54 |           writer.write('two')
55 | 
56 |   def test_child_error(self):
57 |     with pytest.raises(HdfsError):
58 |       def consumer(gen):
59 |         for value in gen:
60 |           if value == 'two':
61 |             raise HdfsError('Yo')
62 |       with AsyncWriter(consumer) as writer:
63 |         writer.write('one')
64 |         writer.write('two')
65 | 
66 |   def test_parent_error(self):
67 |     with pytest.raises(HdfsError):
68 |       def consumer(gen):
69 |         for value in gen:
70 |           pass
71 |       def invalid(w):
72 |         w.write('one')
73 |         raise HdfsError('Ya')
74 |       with AsyncWriter(consumer) as writer:
75 |         invalid(writer)
76 | 
77 | 
78 | class TestTemppath(object):
79 | 
80 |   def test_new(self):
81 |     with temppath() as tpath:
82 |       assert not osp.exists(tpath)
83 | 
84 |   def test_cleanup(self):
85 |     with temppath() as tpath:
86 |       with open(tpath, 'w') as writer:
87 |         writer.write('hi')
88 |     assert not osp.exists(tpath)
89 | 
90 |   def test_dpath(self):
91 |     with temppath() as dpath:
92 |       os.mkdir(dpath)
93 |       with temppath(dpath) as tpath:
94 |         assert osp.dirname(tpath) == dpath
95 | 
--------------------------------------------------------------------------------
/test/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | """Test helpers."""
5 | 
6 | from hdfs import InsecureClient
7 | from hdfs.config import Config
8 | from hdfs.util import HdfsError
9 | from requests.exceptions import ConnectionError
10 | from six.moves.configparser import NoOptionError, NoSectionError
11 | from time import sleep
12 | import os
13 | import posixpath as psp
14 | import pytest
15 | 
16 | 
17 | def save_config(config, path=None):
18 |   """Save configuration to file.
19 | 
20 |   :param config: :class:`~hdfs.config.Config` instance.
21 |   :param path: Optional destination path, defaulting to the configuration's own path.
22 |   """
23 |   with open(path or config.path, 'w') as writer:
24 |     config.write(writer)
25 | 
26 | 
27 | class _IntegrationTest(object):
28 | 
29 |   """Base class to run tests using remote HDFS.
30 | 
31 |   These tests are run only if a `HDFSCLI_TEST_ALIAS` or `HDFSCLI_TEST_URL`
32 |   environment variable is defined (the former taking precedence). For safety,
33 |   a suffix is appended to any defined root.
34 | 
35 |   .. warning::
36 | 
37 |     The new root directory used is entirely cleaned during tests!
38 | 
39 |   It also provides a few helper methods.
40 | 
41 |   """
42 | 
43 |   client = None
44 |   delay = 0.5 # Delay in seconds between tests.
45 |   root_suffix = '.hdfscli' # Also used as default root if none specified.
46 | 
47 |   @classmethod
48 |   def setup_class(cls):
49 |     alias = os.getenv('HDFSCLI_TEST_ALIAS')
50 |     url = os.getenv('HDFSCLI_TEST_URL')
51 |     if alias:
52 |       cls.client = Config().get_client(alias)
53 |       if cls.client.root:
54 |         cls.client.root = psp.join(cls.client.root, cls.root_suffix)
55 |       else:
56 |         cls.client.root = cls.root_suffix
57 |     elif url:
58 |       cls.client = InsecureClient(url, root=cls.root_suffix)
59 | 
60 |   @classmethod
61 |   def teardown_class(cls):
62 |     if cls.client:
63 |       cls.client.delete('', recursive=True)
64 | 
65 |   def setup_method(self):
66 |     if not self.client:
67 |       pytest.skip()
68 |     else:
69 |       try:
70 |         self.client.delete('', recursive=True)
71 |         # The delete call above is wrapped in a `ConnectionError` handler
72 |         # because, on HttpFS only, reusing a streamed connection that wasn't
73 |         # fully read causes failures (even when the response is closed
74 |         # explicitly, it behaves differently than when all of its content has
75 |         # been read). One test which triggers this is
76 |         # `test_ext_avro.py:TestMain.test_schema`. This seems related to
77 |         # https://github.com/kennethreitz/requests/issues/1915 (even on more
78 |         # recent versions of `requests` though).
79 |         #
80 |         # Here is a simple test case that will pass on WebHDFS but fail on
81 |         # HttpFS:
82 |         #
83 |         # .. code:: python
84 |         #
85 |         #   client = Config().get_client('test-webhdfs')
86 |         #   client.write('foo', 'hello')
87 |         #   with client.read('foo') as reader:
88 |         #     pass # Will succeed if this is replaced by `reader.read()`.
89 |         #   client.delete('foo')
90 |         #
91 |       except ConnectionError:
92 |         self.client.delete('', recursive=True) # Retry.
93 |       finally:
94 |         sleep(self.delay)
95 | 
96 |   # Helpers.
97 | 
98 |   def _read(self, hdfs_path, encoding=None):
99 |     with self.client.read(hdfs_path, encoding=encoding) as reader:
100 |       return reader.read()
101 | 
102 |   def _write(self, hdfs_path, data, encoding=None):
103 |     with self.client.write(hdfs_path, encoding=encoding) as writer:
104 |       return writer.write(data)
105 | 
106 |   def _exists(self, hdfs_path):
107 |     return bool(self.client.status(hdfs_path, strict=False))
108 | 
--------------------------------------------------------------------------------
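
A hypothetical usage sketch (not part of the repository): it shows how a new integration test module could build on the `_IntegrationTest` base class above, reusing its sandboxed client and the `_read`/`_write`/`_exists` helpers. The module name, class name, and test name are illustrative assumptions; the import relies on the `test` package layout shown in the tree.

.. code:: python

   from test.util import _IntegrationTest


   class TestRoundTrip(_IntegrationTest):

     def test_write_then_read(self):
       # Paths are resolved against the sandboxed root configured in
       # `setup_class` (via `HDFSCLI_TEST_ALIAS` or `HDFSCLI_TEST_URL`);
       # `setup_method` skips the test when neither variable is defined.
       self._write('greeting', b'hello')
       assert self._exists('greeting')
       assert self._read('greeting') == b'hello'

As in the CI workflow, such a test would be run against a live cluster with something like `HDFSCLI_TEST_URL="$WEBHDFS_URL" python -m pytest`.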