├── .github └── workflows │ ├── ci.yml │ ├── pr.yml │ └── release.yml ├── .readthedocs.yaml ├── AUTHORS ├── CHANGES ├── LICENSE ├── MANIFEST.in ├── README.md ├── doc ├── advanced.rst ├── api.rst ├── conf.py ├── index.rst ├── quickstart.rst └── requirements.txt ├── etc └── hadoop │ ├── core-site.xml │ ├── hdfs-site.xml │ ├── httpfs-site.xml │ └── log4j.properties ├── examples ├── avro-example.py ├── dataframe-example.py └── json-example.py ├── hdfs ├── __init__.py ├── __main__.py ├── client.py ├── config.py ├── ext │ ├── __init__.py │ ├── avro │ │ ├── __init__.py │ │ └── __main__.py │ ├── dataframe.py │ └── kerberos.py └── util.py ├── scripts ├── hadoop.sh └── version.sh ├── setup.py └── test ├── __init__.py ├── dat ├── client_template.py ├── weather.avro ├── weather.avsc └── weather.jsonl ├── test_client.py ├── test_config.py ├── test_examples.py ├── test_ext_avro.py ├── test_ext_dataframe.py ├── test_ext_kerberos.py ├── test_main.py ├── test_util.py └── util.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - master 6 | paths-ignore: 7 | - '**.md' 8 | - .readthedocs.yaml 9 | - doc/* 10 | jobs: 11 | test: 12 | name: Test 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | python-version: 18 | # - '3.6' (see https://github.com/actions/setup-python/issues/544) 19 | - '3.7' 20 | - '3.8' 21 | - '3.9' 22 | - '3.10' 23 | - '3.11' 24 | - '3.12' 25 | steps: 26 | - name: Check out 27 | uses: actions/checkout@v3 28 | - name: Setup Java 29 | uses: actions/setup-java@v3 30 | with: 31 | distribution: 'adopt' 32 | java-version: '8' 33 | - name: Setup Python 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | - name: Download Hadoop 38 | run: | 39 | echo "HADOOP_HOME=$(./scripts/hadoop.sh download)" >>"$GITHUB_ENV" 40 | - name: Configure Hadoop 41 | run: | 42 | echo "HADOOP_CONF_DIR=$(./scripts/hadoop.sh config)" >>"$GITHUB_ENV" 43 | - name: Start HDFS 44 | run: | 45 | ./scripts/hadoop.sh start 46 | echo "WEBHDFS_URL=http://$("$HADOOP_HOME/bin/hdfs" getconf -confKey dfs.namenode.http-address)" >>"$GITHUB_ENV" 47 | echo "HTTPFS_URL=http://localhost:14000" >>"$GITHUB_ENV" 48 | sleep 5 # TODO: Find a better way to wait for all datanodes to become reachable. 
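      # NOTE (untested sketch, not part of the original workflow): the fixed sleep above
      # could eventually be replaced by polling the namenode until it reports a live
      # datanode, for example:
      #   until "$HADOOP_HOME/bin/hdfs" dfsadmin -report | grep -q 'Live datanodes (1)'; do sleep 1; done
      # The exact `dfsadmin -report` output format varies across Hadoop versions, so the
      # grep pattern here is an assumption.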
49 | - name: Install 50 | run: pip install .[avro] coverage mock pytest pytest-cov pandas 51 | - name: Test on WebHDFS 52 | run: HDFSCLI_TEST_URL="$WEBHDFS_URL" python -m pytest --cov=hdfs 53 | - name: Test on HTTPFS 54 | run: HDFSCLI_TEST_URL="$HTTPFS_URL" HDFSCLI_NOSNAPSHOT=1 python -m pytest --cov=hdfs 55 | - name: Stop HDFS 56 | if: always() 57 | run: ./scripts/hadoop.sh stop 58 | tag: 59 | name: Tag 60 | needs: 61 | - test 62 | runs-on: ubuntu-latest 63 | steps: 64 | - name: Check out 65 | uses: actions/checkout@v3 66 | - name: Extract version 67 | id: extract-version 68 | run: | 69 | PACKAGE_VERSION="$(./scripts/version.sh)" 70 | echo "version=$PACKAGE_VERSION" >>"$GITHUB_OUTPUT" 71 | - name: Check if tag exists 72 | uses: mukunku/tag-exists-action@v1.1.0 73 | id: check-version 74 | with: 75 | tag: v${{ steps.extract-version.outputs.version }} 76 | env: 77 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 78 | - name: Create tag 79 | if: steps.check-version.outputs.exists == 'false' 80 | uses: pkgdeps/git-tag-action@v2 81 | with: 82 | git_commit_sha: ${{ github.sha }} 83 | git_tag_prefix: v 84 | github_repo: ${{ github.repository }} 85 | github_token: ${{ secrets.GITHUB_TOKEN }} 86 | version: ${{ steps.extract-version.outputs.version }} 87 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: PR 2 | on: 3 | pull_request: 4 | branches: 5 | - master 6 | paths-ignore: 7 | - '**.md' 8 | - .readthedocs.yaml 9 | - doc/* 10 | jobs: 11 | test: 12 | name: Test 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: 17 | # - '3.6' (see https://github.com/actions/setup-python/issues/544) 18 | - '3.7' 19 | - '3.8' 20 | - '3.9' 21 | - '3.10' 22 | - '3.11' 23 | - '3.12' 24 | steps: 25 | - name: Check out 26 | uses: actions/checkout@v3 27 | - name: Setup Java 28 | uses: actions/setup-java@v3 29 | with: 30 | distribution: 'adopt' 31 | java-version: '8' 32 | - name: Setup Python 33 | uses: actions/setup-python@v4 34 | with: 35 | python-version: ${{ matrix.python-version }} 36 | - name: Download Hadoop 37 | run: | 38 | echo "HADOOP_HOME=$(./scripts/hadoop.sh download)" >>"$GITHUB_ENV" 39 | - name: Configure Hadoop 40 | run: | 41 | echo "HADOOP_CONF_DIR=$(./scripts/hadoop.sh config)" >>"$GITHUB_ENV" 42 | - name: Start HDFS 43 | run: | 44 | ./scripts/hadoop.sh start 45 | echo "WEBHDFS_URL=http://$("$HADOOP_HOME/bin/hdfs" getconf -confKey dfs.namenode.http-address)" >>"$GITHUB_ENV" 46 | echo "HTTPFS_URL=http://localhost:14000" >>"$GITHUB_ENV" 47 | sleep 5 # TODO: Find a better way to wait for all datanodes to become reachable. 
48 | - name: Install 49 | run: pip install .[avro] coverage mock pytest pytest-cov pandas 50 | - name: Test on WebHDFS 51 | run: HDFSCLI_TEST_URL="$WEBHDFS_URL" python -m pytest --cov=hdfs 52 | - name: Test on HTTPFS 53 | run: HDFSCLI_TEST_URL="$HTTPFS_URL" HDFSCLI_NOSNAPSHOT=1 python -m pytest --cov=hdfs 54 | - name: Stop HDFS 55 | if: always() 56 | run: ./scripts/hadoop.sh stop 57 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | release: 4 | types: 5 | - published 6 | jobs: 7 | test: 8 | name: Publish 9 | timeout-minutes: 2 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out 13 | uses: actions/checkout@v2 14 | - name: Setup Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.10' 18 | - name: Install 19 | run: pip install twine 20 | - name: Publish 21 | run: | 22 | python setup.py sdist 23 | twine upload dist/* 24 | env: 25 | TWINE_USERNAME: __token__ 26 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 27 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-22.04 4 | tools: 5 | python: '3.9' 6 | sphinx: 7 | configuration: doc/conf.py 8 | python: 9 | install: 10 | - path: . 11 | method: pip 12 | extra_requirements: 13 | - avro 14 | - requirements: doc/requirements.txt 15 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Matthieu Monsch 2 | Artemy Kolchinsky 3 | Evan Borgstrom 4 | Wes McKinney 5 | Isaac Hodes 6 | -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- 1 | HdfsCLI 2 | ======= 3 | 4 | Version 2.0 (2015/20/08) 5 | ------------------------ 6 | 7 | * Add python 3 support (the Kerberos extension's requirements must however be 8 | manually installed). 9 | * Allow specifying relative client roots. These will be assumed relative to the 10 | user's home directory. 11 | * Add several client methods: `makedirs`, `set_times`, `checksum`, 12 | `set_replication`, etc. 13 | * Add `progress` argument to `Client.read`, `Client.upload`, and 14 | `Client.download`. Also add a `chunk_size` argument to the latter to allow 15 | better tracking. 16 | * Add `strict` option to `Client.status` and `Client.content` to perform path 17 | existence checks. 18 | * `Client.write` now can be used as a context manager (returning a file-like 19 | object). 20 | * `Client.read` and `Client.write` can now return file-like objects (supporting 21 | `read` and `write` calls respectively). 22 | * Improve robustness of `KerberosClient`. In particular add the 23 | `max_concurrency` parameter which can be tuned to prevent authentication 24 | errors when too many simultaneous requests are being made. Along with the new 25 | delay parameter, this lets us remove the timeouts in `Client.download` and 26 | `Client.upload`, which both simplify and speed up these functions. 27 | * Add `Config` class which handles all CLI configuration (e.g. aliases and 28 | logging). 29 | * Add `autoload.modules` and `autoload.paths` configuration options. 
30 | * Rename alias configuration sections to `ALIAS.alias` (the old format, 31 | `ALIAS_alias` is still supported). 32 | * Add `--verbose` option to CLI, enabling logging at various levels. 33 | * Switch Avro extension to using `fastavro`. This speeds up `AvroWriter` and 34 | `AvroReader` by a significant amount (~5 times faster on a reasonable 35 | connection). 36 | * Add `write` command to Avro CLI. 37 | * Remove CSV support for dataframe extension. 38 | 39 | Breaking changes: 40 | 41 | * Change default configuration file path (to `~/.hdfscli.cfg`). 42 | * Change location of `default.alias` option in configuration file (from command 43 | specific section to `global`). 44 | * Add `session` argument to `Client` and remove newly redundant parameters 45 | (e.g. `verify`, `cert`). 46 | * Remove `Client.from_alias` method (delegated to the new `Config` class). The 47 | `Client.from_options` method is now public (renamed from 48 | `Client._from_options`). 49 | * Change default entry point name to `hdfscli` to avoid clashing with Hadoop 50 | HDFS script. 51 | * `Client.delete` now returns a boolean indicating success (rather than failing 52 | if the path was already empty). 53 | * `Client.read` must now be used inside a `with` block. This ensures that 54 | connections are properly closed. The context manager now returns a file like 55 | object (useful for composing with other functions, e.g. to decode Avro). 56 | Setting `chunk_size` to a positive value will make it return a generator 57 | similar to the previous behavior. 58 | * Rename `Client.set_permissions` to `Client.set_permission` to make 59 | `permission` argument uniform across `Client` methods (always singular, 60 | consistent with WebHDFS API). 61 | * `Client.parts` will now throw an error if called on a normal file. 62 | * Make most client attributes private (e.g. `cert`, `timeout`, etc.), except 63 | `url` and `root`. 64 | * Remove `--` prefix from CLI commands. Also simplify CLI to only interactive, 65 | download and upload commands (write and read behavior can be achieved by 66 | passing '-' as local path). The `Client` API changes should make it more 67 | convenient to perform these from a python shell. 68 | * Rename several CLI options (e.g. `--log`, `--force`, `--version`). 69 | * Change meaning of `n_threads` option in `Client.download` and 70 | `Client.upload`. `0` now means one thread per part-file rather than a single 71 | thread. 72 | * Change `Client.walk` to be consistent with `os.walk`. Also change meaning of 73 | `depth` option (`0` being unlimited). 74 | * Add `status` option to `Client.list`, `Client.walk`, and `Client.parts`. By 75 | default these functions now only return the names of the relevant files and 76 | folders. 77 | * Remove `Client.append` method (replaced by `append` keyword argument to 78 | `Client.write`). 79 | * Symbols exported by extensions aren't imported in the main `hdfs` module 80 | anymore. This removes the need for some custom error handling (when 81 | dependency requirements weren't met). 82 | * Remove Bash autocompletion file (for now). 83 | * Remove compatibility layer for entry point configuration (i.e. 84 | `HDFS_ENTRY_POINT` isn't supported anymore). 85 | 86 | 87 | Version 1.4.0 (2015/07/24) 88 | -------------------------- 89 | 90 | * Add support for download and upload of arbitrary folders. 91 | * Deprecate `Client.append` (in favor of `append` argument to `Client.write`). 
92 | 93 | 94 | Version 1.1.0 (2015/06/23) 95 | -------------------------- 96 | 97 | * Rename Avro extension entry point to `hdfs-avro`. 98 | 99 | 100 | Version 1.0.1 (2015/06/17) 101 | -------------------------- 102 | 103 | * Added support for Windows. 104 | * Added support for remote filepaths with `=` characters. 105 | 106 | 107 | Version 0.3.0 (2014/11/14) 108 | -------------------------- 109 | 110 | * Added `--interactive` command. 111 | 112 | Breaking changes: 113 | 114 | * Renamed `--info` command to `--list`. 115 | * Made `--interactive` the new default command. 116 | 117 | 118 | Version 0.2.6 (2014/08/04) 119 | -------------------------- 120 | 121 | * Added parallelized downloading. 122 | * Added Avro-format reading and writing. 123 | * Added `hdfs.ext.dataframe` extension. 124 | 125 | 126 | Version 0.2.0 (2014/04/26) 127 | -------------------------- 128 | 129 | * Added `Client.status` and `Client.content` methods. 130 | * Added callback to `Client.write`. 131 | 132 | Breaking changes: 133 | 134 | * Removed content from `Client.walk`. 135 | * Simplified CLI. All download and uploads are normalized through standard in, 136 | and standard out. 137 | 138 | 139 | Version 0.1.0 (2014/03/25) 140 | -------------------------- 141 | 142 | * Initial release 143 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Matthieu Monsch. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HdfsCLI [![CI](https://github.com/mtth/hdfs/actions/workflows/ci.yml/badge.svg)](https://github.com/mtth/hdfs/actions/workflows/ci.yml) [![Pypi badge](https://badge.fury.io/py/hdfs.svg)](https://pypi.python.org/pypi/hdfs/) [![Downloads badge](https://img.shields.io/pypi/dm/hdfs.svg)](https://pypistats.org/packages/hdfs) 2 | 3 | API and command line interface for HDFS. 4 | 5 | ``` 6 | $ hdfscli --alias=dev 7 | 8 | Welcome to the interactive HDFS python shell. 9 | The HDFS client is available as `CLIENT`. 
10 | 11 | In [1]: CLIENT.list('models/') 12 | Out[1]: ['1.json', '2.json'] 13 | 14 | In [2]: CLIENT.status('models/2.json') 15 | Out[2]: { 16 | 'accessTime': 1439743128690, 17 | 'blockSize': 134217728, 18 | 'childrenNum': 0, 19 | 'fileId': 16389, 20 | 'group': 'supergroup', 21 | 'length': 48, 22 | 'modificationTime': 1439743129392, 23 | 'owner': 'drwho', 24 | 'pathSuffix': '', 25 | 'permission': '755', 26 | 'replication': 1, 27 | 'storagePolicy': 0, 28 | 'type': 'FILE' 29 | } 30 | 31 | In [3]: with CLIENT.read('models/2.json', encoding='utf-8') as reader: 32 | ...: from json import load 33 | ...: model = load(reader) 34 | ...: 35 | ``` 36 | 37 | ## Features 38 | 39 | * Python 3 bindings for the [WebHDFS][] (and [HttpFS][]) API, 40 | supporting both secure and insecure clusters. 41 | * Command line interface to transfer files and start an interactive client 42 | shell, with aliases for convenient namenode URL caching. 43 | * Additional functionality through optional extensions: 44 | 45 | + `avro`, to [read and write Avro files directly from HDFS][]. 46 | + `dataframe`, to [load and save Pandas dataframes][]. 47 | + `kerberos`, to [support Kerberos authenticated clusters][]. 48 | 49 | See the [documentation][] to learn more. 50 | 51 | ## Getting started 52 | 53 | ```sh 54 | $ pip install hdfs 55 | ``` 56 | 57 | Then hop on over to the [quickstart][] guide. A [Conda 58 | feedstock](https://github.com/conda-forge/python-hdfs-feedstock) is also 59 | available. 60 | 61 | ## Testing 62 | 63 | HdfsCLI is tested against both [WebHDFS][] and [HttpFS][]. There are two ways 64 | of running tests (see `scripts/` for helpers to set up a test HDFS cluster): 65 | 66 | ```sh 67 | $ HDFSCLI_TEST_URL=http://localhost:50070 pytest # Using a namenode's URL. 68 | $ HDFSCLI_TEST_ALIAS=dev pytest # Using an alias. 69 | ``` 70 | 71 | ## Contributing 72 | 73 | We'd love to hear what you think on the [issues][] page. Pull requests are also 74 | most welcome! 75 | 76 | [HttpFS]: http://hadoop.apache.org/docs/current/hadoop-hdfs-httpfs/ 77 | [WebHDFS]: http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html 78 | [read and write Avro files directly from HDFS]: https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.ext.avro 79 | [load and save Pandas dataframes]: https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.ext.dataframe 80 | [support Kerberos authenticated clusters]: https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.ext.kerberos 81 | [documentation]: https://hdfscli.readthedocs.io/ 82 | [quickstart]: https://hdfscli.readthedocs.io/en/latest/quickstart.html 83 | [issues]: https://github.com/mtth/hdfs/issues 84 | -------------------------------------------------------------------------------- /doc/advanced.rst: -------------------------------------------------------------------------------- 1 | .. default-role:: code 2 | 3 | 4 | .. _advanced_usage: 5 | 6 | Advanced usage 7 | ============== 8 | 9 | 10 | Path expansion 11 | -------------- 12 | 13 | All :class:`~hdfs.client.Client` methods provide a path expansion functionality 14 | via the :meth:`~hdfs.client.Client.resolve` method. It enables the use of 15 | special markers to identify paths. For example, it currently supports the 16 | `#LATEST` marker which expands to the last modified file inside a given folder. 17 | 18 | .. code-block:: python 19 | 20 | # Load the most recent data in the `tracking` folder. 
21 | with client.read('tracking/#LATEST') as reader: 22 | data = reader.read() 23 | 24 | See the method's documentation for more information. 25 | 26 | 27 | .. _custom_client: 28 | 29 | Custom client support 30 | --------------------- 31 | 32 | In order for the CLI to be able to instantiate arbitrary client classes, it has 33 | to be able to discover these first. This is done by specifying where they are 34 | defined in the `global` section of HdfsCLI's configuration file. For example, 35 | here is how we can make the :class:`~hdfs.ext.kerberos.KerberosClient` class 36 | available: 37 | 38 | .. code-block:: cfg 39 | 40 | [global] 41 | autoload.modules = hdfs.ext.kerberos 42 | 43 | More precisely, there are two options for telling the CLI where to load the 44 | clients from: 45 | 46 | + `autoload.modules`, a comma-separated list of modules (which must be on 47 | python's path). 48 | + `autoload.paths`, a comma-separated list of paths to python files. 49 | 50 | Implementing custom clients can be particularly useful for passing default 51 | options (e.g. a custom `session` argument to each client). We describe below a 52 | working example implementing a secure client with optional custom certificate 53 | support. 54 | 55 | We first implement our new client and save it somewhere, for example 56 | `/etc/hdfscli.py`. 57 | 58 | .. code-block:: python 59 | 60 | from hdfs import Client 61 | from requests import Session 62 | 63 | class SecureClient(Client): 64 | 65 | """A new client subclass for handling HTTPS connections. 66 | 67 | :param url: URL to namenode. 68 | :param cert: Local certificate. See `requests` documentation for details 69 | on how to use this. 70 | :param verify: Whether to check the host's certificate. 71 | :param \*\*kwargs: Keyword arguments passed to the default `Client` 72 | constructor. 73 | 74 | """ 75 | 76 | def __init__(self, url, cert=None, verify=True, **kwargs): 77 | session = Session() 78 | if ',' in cert: 79 | session.cert = [path.strip() for path in cert.split(',')] 80 | else: 81 | session.cert = cert 82 | if isinstance(verify, basestring): # Python 2. 83 | verify = verify.lower() in ('true', 'yes', 'ok') 84 | session.verify = verify 85 | super(SecureClient, self).__init__(url, session=session, **kwargs) 86 | 87 | We then edit our configuration to tell the CLI how to load this module and 88 | define a `prod` alias using our new client: 89 | 90 | .. code-block:: cfg 91 | 92 | [global] 93 | autoload.paths = /etc/hdfscli.py 94 | 95 | [prod.alias] 96 | client = SecureClient 97 | url = https://host:port 98 | cert = /etc/server.crt, /etc/key 99 | 100 | 101 | Note that options used to instantiate clients from the CLI (using 102 | :meth:`hdfs.client.Client.from_options` under the hood) are always passed in as 103 | strings. This is why we had to implement some parsing logic in the 104 | `SecureClient` constructor above. 105 | 106 | 107 | Tracking transfer progress 108 | -------------------------- 109 | 110 | The :meth:`~hdfs.client.Client.read`, :meth:`~hdfs.client.Client.upload`, 111 | :meth:`~hdfs.client.Client.download` client methods accept a `progress` 112 | callback argument which can be used to track transfers. The passed function 113 | will be called every `chunk_size` bytes with two arguments: 114 | 115 | + The source path of the file currently being transferred. 116 | + The number of bytes currently transferred for this file or `-1` to signal 117 | that this file's transfer has just finished. 
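For example, a callback with this signature can be passed directly to a transfer
method. This is a minimal sketch; the `print_progress` function and the paths used
below are illustrative only:

.. code-block:: python

   def print_progress(hdfs_path, nbytes):
     if nbytes >= 0:
       print('%s: %s bytes transferred' % (hdfs_path, nbytes))
     else:
       print('%s: transfer complete' % (hdfs_path, ))

   # The `progress` argument is accepted by `read`, `upload`, and `download`.
   client.download('logs/', 'local-logs/', progress=print_progress, chunk_size=2 ** 16)
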
118 | 119 | Below is an implementation of a toy tracker which simply outputs to standard 120 | error the total number of transferred bytes each time a file transfer completes 121 | (we must still take care to ensure correct behavior even during multi-threaded 122 | transfers). 123 | 124 | .. code-block:: python 125 | 126 | from sys import stderr 127 | from threading import Lock 128 | 129 | class Progress(object): 130 | 131 | """Basic progress tracker callback.""" 132 | 133 | def __init__(self): 134 | self._data = {} 135 | self._lock = Lock() 136 | 137 | def __call__(self, hdfs_path, nbytes): 138 | with self._lock: 139 | if nbytes >= 0: 140 | self._data[hdfs_path] = nbytes 141 | else: 142 | stderr.write('%s\n' % (sum(self._data.values()), )) 143 | 144 | Finally, note that the :meth:`~hdfs.client.Client.write` method doesn't expose 145 | a `progress` argument since this functionality can be replicated by passing a 146 | custom `data` generator (or within the context manager). 147 | 148 | 149 | Logging configuration 150 | --------------------- 151 | 152 | It is possible to configure and disable where the CLI logs are written for each 153 | entry point. To do this, we can set the following options in its corresponding 154 | section (the entry point's name suffixed with `.command`). For example: 155 | 156 | .. code-block:: cfg 157 | 158 | [hdfscli-avro.command] 159 | log.level = INFO 160 | log.path = /tmp/hdfscli/avro.log 161 | 162 | The following options are available: 163 | 164 | + `log.level`, handler log level (defaults to `DEBUG`). 165 | + `log.path`, path to log file. The log is rotated every day (keeping a single 166 | copy). The default is a file named `COMMAND.log` in your current temporary 167 | directory. It is possible to view the currently active log file at any time 168 | by using the `--log` option at the command line. 169 | + `log.disable`, disable logging to a file entirely (defaults to `False`). 170 | 171 | 172 | Renaming entry points 173 | --------------------- 174 | 175 | By default the command line entry point will be named `hdfscli`. You can choose 176 | another name by specifying the `HDFSCLI_ENTRY_POINT` environment variable at 177 | installation time: 178 | 179 | .. code-block:: bash 180 | 181 | $ HDFSCLI_ENTRY_POINT=hdfs pip install hdfs 182 | 183 | Extension prefixes will be adjusted similarly (e.g. in the previous example, 184 | `hdfscli-avro` would become `hdfs-avro`). 185 | -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | .. default-role:: code 2 | 3 | 4 | .. _api_reference: 5 | 6 | API reference 7 | ============= 8 | 9 | 10 | Client 11 | ------ 12 | 13 | .. automodule:: hdfs.client 14 | :members: 15 | :show-inheritance: 16 | 17 | 18 | Extensions 19 | ---------- 20 | 21 | The following extensions are currently available: 22 | 23 | 24 | .. _kerberos_extension: 25 | 26 | Kerberos 27 | ******** 28 | 29 | .. automodule:: hdfs.ext.kerberos 30 | :members: 31 | :show-inheritance: 32 | 33 | 34 | Avro 35 | **** 36 | 37 | .. automodule:: hdfs.ext.avro 38 | :members: 39 | :show-inheritance: 40 | 41 | 42 | Dataframe 43 | ********* 44 | 45 | .. automodule:: hdfs.ext.dataframe 46 | :members: 47 | 48 | 49 | Configuration 50 | ------------- 51 | 52 | .. automodule:: hdfs.config 53 | :members: 54 | :show-inheritance: 55 | 56 | 57 | Utilities 58 | --------- 59 | 60 | .. 
automodule:: hdfs.util 61 | :members: 62 | :show-inheritance: 63 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # hdfs documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Mar 6 16:04:56 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import os 16 | import sys 17 | try: 18 | from unittest import mock 19 | except ImportError: 20 | import mock 21 | 22 | MOCK_MODULES = ['fastavro', 'pandas', 'requests_kerberos'] 23 | for mod_name in MOCK_MODULES: 24 | sys.modules[mod_name] = mock.Mock() 25 | 26 | # If extensions (or modules to document with autodoc) are in another directory, 27 | # add these directories to sys.path here. If the directory is relative to the 28 | # documentation root, use os.path.abspath to make it absolute, like shown here. 29 | #sys.path.insert(0, os.path.abspath('.')) 30 | 31 | # -- General configuration ------------------------------------------------ 32 | 33 | # If your documentation needs a minimal Sphinx version, state it here. 34 | #needs_sphinx = '1.0' 35 | 36 | # Add any Sphinx extension module names here, as strings. They can be 37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 38 | # ones. 39 | extensions = [ 40 | 'sphinx.ext.autodoc', 41 | 'sphinx.ext.viewcode', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix of source filenames. 48 | source_suffix = '.rst' 49 | 50 | # The encoding of source files. 51 | #source_encoding = 'utf-8-sig' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # General information about the project. 57 | project = u'HdfsCLI' 58 | copyright = u'2014, Matthieu Monsch' 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | import hdfs 65 | # The short X.Y version. 66 | version = hdfs.__version__.rsplit('.', 1)[0] 67 | # The full version, including alpha/beta/rc tags. 68 | release = hdfs.__version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | #language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to some 75 | # non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 
93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built documents. 106 | #keep_warnings = False 107 | 108 | # Autodoc 109 | 110 | autoclass_content = 'both' 111 | 112 | 113 | # -- Options for HTML output ---------------------------------------------- 114 | 115 | # The theme to use for HTML and HTML Help pages. See the documentation for 116 | # a list of builtin themes. 117 | html_theme = 'default' 118 | 119 | # Theme options are theme-specific and customize the look and feel of a theme 120 | # further. For a list of options available for each theme, see the 121 | # documentation. 122 | #html_theme_options = {} 123 | 124 | # Add any paths that contain custom themes here, relative to this directory. 125 | #html_theme_path = [] 126 | 127 | # The name for this set of Sphinx documents. If None, it defaults to 128 | # " v documentation". 129 | #html_title = None 130 | 131 | # A shorter title for the navigation bar. Default is the same as html_title. 132 | #html_short_title = None 133 | 134 | # The name of an image file (relative to this directory) to place at the top 135 | # of the sidebar. 136 | #html_logo = None 137 | 138 | # The name of an image file (within the static path) to use as favicon of the 139 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 140 | # pixels large. 141 | #html_favicon = None 142 | 143 | # Add any paths that contain custom static files (such as style sheets) here, 144 | # relative to this directory. They are copied after the builtin static files, 145 | # so a file named "default.css" will overwrite the builtin "default.css". 146 | #html_static_path = ['_static'] 147 | html_static_path = [] 148 | 149 | # Add any extra paths that contain custom files (such as robots.txt or 150 | # .htaccess) here, relative to this directory. These files are copied 151 | # directly to the root of the documentation. 152 | #html_extra_path = [] 153 | 154 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 155 | # using the given strftime format. 156 | #html_last_updated_fmt = '%b %d, %Y' 157 | 158 | # If true, SmartyPants will be used to convert quotes and dashes to 159 | # typographically correct entities. 160 | #html_use_smartypants = True 161 | 162 | # Custom sidebar templates, maps document names to template names. 163 | #html_sidebars = {} 164 | 165 | # Additional templates that should be rendered to pages, maps page names to 166 | # template names. 167 | #html_additional_pages = {} 168 | 169 | # If false, no module index is generated. 170 | #html_domain_indices = True 171 | 172 | # If false, no index is generated. 173 | #html_use_index = True 174 | 175 | # If true, the index is split into individual pages for each letter. 176 | #html_split_index = False 177 | 178 | # If true, links to the reST sources are added to the pages. 179 | #html_show_sourcelink = True 180 | 181 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 182 | #html_show_sphinx = True 183 | 184 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
185 | #html_show_copyright = True 186 | 187 | # If true, an OpenSearch description file will be output, and all pages will 188 | # contain a tag referring to it. The value of this option must be the 189 | # base URL from which the finished HTML is served. 190 | #html_use_opensearch = '' 191 | 192 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 193 | #html_file_suffix = None 194 | 195 | # Output file base name for HTML help builder. 196 | htmlhelp_basename = 'hdfsdoc' 197 | 198 | 199 | # -- Options for LaTeX output --------------------------------------------- 200 | 201 | latex_elements = { 202 | # The paper size ('letterpaper' or 'a4paper'). 203 | #'papersize': 'letterpaper', 204 | 205 | # The font size ('10pt', '11pt' or '12pt'). 206 | #'pointsize': '10pt', 207 | 208 | # Additional stuff for the LaTeX preamble. 209 | #'preamble': '', 210 | } 211 | 212 | # Grouping the document tree into LaTeX files. List of tuples 213 | # (source start file, target name, title, 214 | # author, documentclass [howto, manual, or own class]). 215 | latex_documents = [ 216 | ('index', 'hdfs.tex', u'hdfs Documentation', 217 | u'Author', 'manual'), 218 | ] 219 | 220 | # The name of an image file (relative to this directory) to place at the top of 221 | # the title page. 222 | #latex_logo = None 223 | 224 | # For "manual" documents, if this is true, then toplevel headings are parts, 225 | # not chapters. 226 | #latex_use_parts = False 227 | 228 | # If true, show page references after internal links. 229 | #latex_show_pagerefs = False 230 | 231 | # If true, show URL addresses after external links. 232 | #latex_show_urls = False 233 | 234 | # Documents to append as an appendix to all manuals. 235 | #latex_appendices = [] 236 | 237 | # If false, no module index is generated. 238 | #latex_domain_indices = True 239 | 240 | 241 | # -- Options for manual page output --------------------------------------- 242 | 243 | # One entry per manual page. List of tuples 244 | # (source start file, name, description, authors, manual section). 245 | man_pages = [ 246 | ('index', 'hdfs', u'hdfs documentation', 247 | [u'Author'], 1) 248 | ] 249 | 250 | # If true, show URL addresses after external links. 251 | #man_show_urls = False 252 | 253 | 254 | # -- Options for Texinfo output ------------------------------------------- 255 | 256 | # Grouping the document tree into Texinfo files. List of tuples 257 | # (source start file, target name, title, author, 258 | # dir menu entry, description, category) 259 | texinfo_documents = [ 260 | ('index', 'hdfs', u'hdfs documentation', 261 | u'Author', 'hdfs', 'One line description of project.', 262 | 'Miscellaneous'), 263 | ] 264 | 265 | # Documents to append as an appendix to all manuals. 266 | #texinfo_appendices = [] 267 | 268 | # If false, no module index is generated. 269 | #texinfo_domain_indices = True 270 | 271 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 272 | #texinfo_show_urls = 'footnote' 273 | 274 | # If true, do not generate a @detailmenu in the "Top" node's menu. 275 | #texinfo_no_detailmenu = False 276 | 277 | 278 | # -- Options for Epub output ---------------------------------------------- 279 | 280 | # Bibliographic Dublin Core info. 281 | epub_title = u'hdfs' 282 | epub_author = u'Author' 283 | epub_publisher = u'Author' 284 | epub_copyright = u'2014, Matthieu Monsch' 285 | 286 | # The basename for the epub file. It defaults to the project name. 287 | #epub_basename = u'hdfs' 288 | 289 | # The HTML theme for the epub output. 
Since the default themes are not optimized 290 | # for small screen space, using the same theme for HTML and epub output is 291 | # usually not wise. This defaults to 'epub', a theme designed to save visual 292 | # space. 293 | #epub_theme = 'epub' 294 | 295 | # The language of the text. It defaults to the language option 296 | # or en if the language is not set. 297 | #epub_language = '' 298 | 299 | # The scheme of the identifier. Typical schemes are ISBN or URL. 300 | #epub_scheme = '' 301 | 302 | # The unique identifier of the text. This can be a ISBN number 303 | # or the project homepage. 304 | #epub_identifier = '' 305 | 306 | # A unique identification for the text. 307 | #epub_uid = '' 308 | 309 | # A tuple containing the cover image and cover page html template filenames. 310 | #epub_cover = () 311 | 312 | # A sequence of (type, uri, title) tuples for the guide element of content.opf. 313 | #epub_guide = () 314 | 315 | # HTML files that should be inserted before the pages created by sphinx. 316 | # The format is a list of tuples containing the path and title. 317 | #epub_pre_files = [] 318 | 319 | # HTML files that should be inserted after the pages created by sphinx. 320 | # The format is a list of tuples containing the path and title. 321 | #epub_post_files = [] 322 | 323 | # A list of files that should not be packed into the epub file. 324 | epub_exclude_files = ['search.html'] 325 | 326 | # The depth of the table of contents in toc.ncx. 327 | #epub_tocdepth = 3 328 | 329 | # Allow duplicate toc entries. 330 | #epub_tocdup = True 331 | 332 | # Choose between 'default' and 'includehidden'. 333 | #epub_tocscope = 'default' 334 | 335 | # Fix unsupported image types using the PIL. 336 | #epub_fix_images = False 337 | 338 | # Scale large images. 339 | #epub_max_image_width = 0 340 | 341 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 342 | #epub_show_urls = 'inline' 343 | 344 | # If false, no index is generated. 345 | #epub_use_index = True 346 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. default-role:: code 2 | 3 | 4 | HdfsCLI 5 | ======= 6 | 7 | API and command line interface for HDFS. 8 | 9 | + `Project homepage on GitHub`_ 10 | + `PyPI entry`_ 11 | 12 | 13 | Installation 14 | ------------ 15 | 16 | Using pip_: 17 | 18 | .. code-block:: bash 19 | 20 | $ pip install hdfs 21 | 22 | By default none of the package requirements for extensions are installed. To do 23 | so simply suffix the package name with the desired extensions: 24 | 25 | .. code-block:: bash 26 | 27 | $ pip install hdfs[avro,dataframe,kerberos] 28 | 29 | 30 | User guide 31 | ---------- 32 | 33 | .. toctree:: 34 | :maxdepth: 2 35 | 36 | quickstart 37 | advanced 38 | api 39 | 40 | 41 | Sample script 42 | ------------- 43 | 44 | .. literalinclude:: ../examples/json.py 45 | 46 | More examples can be found in the `examples/` folder on GitHub. 47 | 48 | 49 | .. _Project homepage on GitHub: https://github.com/mtth/hdfs 50 | .. _PyPI entry: https://pypi.python.org/pypi/hdfs/ 51 | .. _pip: http://www.pip-installer.org/en/latest/ 52 | -------------------------------------------------------------------------------- /doc/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. 
default-role:: code 2 | 3 | 4 | Quickstart 5 | ========== 6 | 7 | This page first goes through the steps required to configure HdfsCLI's command 8 | line interface then gives an overview of the python API. If you are only 9 | interested in using HdfsCLI as a library, then feel free to jump ahead to the 10 | `Python bindings`_ section. 11 | 12 | 13 | Configuration 14 | ------------- 15 | 16 | HdfsCLI uses *aliases* to figure out how to connect to different HDFS clusters. 17 | These are defined in HdfsCLI's configuration file, located by default at 18 | `~/.hdfscli.cfg` (or elsewhere by setting the `HDFSCLI_CONFIG` environment 19 | variable correspondingly). See below for a sample configuration defining two 20 | aliases, `dev` and `prod`: 21 | 22 | .. code-block:: cfg 23 | 24 | [global] 25 | default.alias = dev 26 | 27 | [dev.alias] 28 | url = http://dev.namenode:port 29 | user = ann 30 | 31 | [prod.alias] 32 | url = http://prod.namenode:port 33 | root = /jobs/ 34 | 35 | Each alias is defined as its own `ALIAS.alias` section which must at least 36 | contain a `url` option with the URL to the namenode (including protocol and 37 | port). All other options can be omitted. If specified, `client` determines 38 | which :class:`hdfs.client.Client` class to use and the remaining options are 39 | passed as keyword arguments to the appropriate constructor. The currently 40 | available client classes are: 41 | 42 | + :class:`~hdfs.client.InsecureClient` (the default) 43 | + :class:`~hdfs.client.TokenClient` 44 | 45 | See the :ref:`Kerberos extension ` to enable the 46 | :class:`~hdfs.ext.kerberos.KerberosClient` and :ref:`custom_client` to learn 47 | how to use other client classes. 48 | 49 | The `url` option can be configured to support High Availability namenodes when using WebHDFS, 50 | simply add more URLs by delimiting with a semicolon (`;`). 51 | 52 | Finally, note the `default.alias` entry in the global configuration section 53 | which will be used as default alias if none is specified. 54 | 55 | 56 | Command line interface 57 | ---------------------- 58 | 59 | HdfsCLI comes by default with a single entry point `hdfscli` which provides a 60 | convenient interface to perform common actions. All its commands accept an 61 | `--alias` argument (described above), which defines against which cluster to 62 | operate. 63 | 64 | 65 | Downloading and uploading files 66 | ******************************* 67 | 68 | HdfsCLI supports downloading and uploading files and folders transparently from 69 | HDFS (we can also specify the degree of parallelism by using the `--threads` 70 | option). 71 | 72 | .. code-block:: bash 73 | 74 | $ # Write a single file to HDFS. 75 | $ hdfscli upload --alias=dev weights.json models/ 76 | $ # Read all files inside a folder from HDFS and store them locally. 77 | $ hdfscli download export/results/ "results-$(date +%F)" 78 | 79 | If reading (resp. writing) a single file, its contents can also be streamed to 80 | standard out (resp. from standard in) by using `-` as path argument: 81 | 82 | .. code-block:: bash 83 | 84 | $ # Read a file from HDFS and append its contents to a local log file. 85 | $ hdfscli download logs/1987-03-23.txt - >>logs 86 | 87 | By default HdfsCLI will throw an error if trying to write to an existing path 88 | (either locally or on HDFS). We can force the path to be overwritten with the 89 | `--force` option. 90 | 91 | 92 | .. 
_interactive_shell: 93 | 94 | Interactive shell 95 | ***************** 96 | 97 | The `interactive` command (used also when no command is specified) will create 98 | an HDFS client and expose it inside a python shell (using IPython_ if 99 | available). This makes is convenient to perform file system operations on HDFS 100 | and interact with its data. See :ref:`python_bindings` below for an overview of 101 | the methods available. 102 | 103 | .. code-block:: bash 104 | 105 | $ hdfscli --alias=dev 106 | 107 | Welcome to the interactive HDFS python shell. 108 | The HDFS client is available as `CLIENT`. 109 | 110 | In [1]: CLIENT.list('data/') 111 | Out[1]: ['1.json', '2.json'] 112 | 113 | In [2]: CLIENT.status('data/2.json') 114 | Out[2]: { 115 | 'accessTime': 1439743128690, 116 | 'blockSize': 134217728, 117 | 'childrenNum': 0, 118 | 'fileId': 16389, 119 | 'group': 'supergroup', 120 | 'length': 2, 121 | 'modificationTime': 1439743129392, 122 | 'owner': 'drwho', 123 | 'pathSuffix': '', 124 | 'permission': '755', 125 | 'replication': 1, 126 | 'storagePolicy': 0, 127 | 'type': 'FILE' 128 | } 129 | 130 | In [3]: CLIENT.delete('data/2.json') 131 | Out[3]: True 132 | 133 | Using the full power of python lets us easily perform more complex operations 134 | such as renaming folder which match some pattern, deleting files which haven't 135 | been accessed for some duration, finding all paths owned by a certain user, 136 | etc. 137 | 138 | 139 | More 140 | **** 141 | 142 | Cf. `hdfscli --help` for the full list of commands and options. 143 | 144 | 145 | .. _python_bindings: 146 | 147 | Python bindings 148 | --------------- 149 | 150 | 151 | Instantiating a client 152 | ********************** 153 | 154 | The simplest way of getting a :class:`hdfs.client.Client` instance is by using 155 | the :ref:`interactive_shell` described above, where the client will be 156 | automatically available. To instantiate a client programmatically, there are 157 | two options: 158 | 159 | The first is to import the client class and call its constructor directly. This 160 | is the most straightforward and flexible, but doesn't let us reuse our 161 | configured aliases: 162 | 163 | .. code-block:: python 164 | 165 | from hdfs import InsecureClient 166 | client = InsecureClient('http://host:port', user='ann') 167 | 168 | The second leverages the :class:`hdfs.config.Config` class to load an existing 169 | configuration file (defaulting to the same one as the CLI) and create clients 170 | from existing aliases: 171 | 172 | .. code-block:: python 173 | 174 | from hdfs import Config 175 | client = Config().get_client('dev') 176 | 177 | 178 | Reading and writing files 179 | ************************* 180 | 181 | The :meth:`~hdfs.client.Client.read` method provides a file-like interface for 182 | reading files from HDFS. It must be used in a `with` block (making sure that 183 | connections are always properly closed): 184 | 185 | .. code-block:: python 186 | 187 | # Loading a file in memory. 188 | with client.read('features') as reader: 189 | features = reader.read() 190 | 191 | # Directly deserializing a JSON object. 192 | with client.read('model.json', encoding='utf-8') as reader: 193 | from json import load 194 | model = load(reader) 195 | 196 | If a `chunk_size` argument is passed, the method will return a generator 197 | instead, making it sometimes simpler to stream the file's contents. 198 | 199 | .. code-block:: python 200 | 201 | # Stream a file. 
202 | with client.read('features', chunk_size=8096) as reader: 203 | for chunk in reader: 204 | pass 205 | 206 | Similarly, if a `delimiter` argument is passed, the method will return a 207 | generator of the delimited chunks. 208 | 209 | .. code-block:: python 210 | 211 | with client.read('samples.csv', encoding='utf-8', delimiter='\n') as reader: 212 | for line in reader: 213 | pass 214 | 215 | Writing files to HDFS is done using the :meth:`~hdfs.client.Client.write` 216 | method which returns a file-like writable object: 217 | 218 | .. code-block:: python 219 | 220 | # Writing part of a file. 221 | with open('samples') as reader, client.write('samples') as writer: 222 | for line in reader: 223 | if line.startswith('-'): 224 | writer.write(line) 225 | 226 | # Writing a serialized JSON object. 227 | with client.write('model.json', encoding='utf-8') as writer: 228 | from json import dump 229 | dump(model, writer) 230 | 231 | For convenience, it is also possible to pass an iterable `data` argument 232 | directly to the method. 233 | 234 | .. code-block:: python 235 | 236 | # This is equivalent to the JSON example above. 237 | from json import dumps 238 | client.write('model.json', dumps(model)) 239 | 240 | 241 | Exploring the file system 242 | ************************* 243 | 244 | All :class:`~hdfs.client.Client` subclasses expose a variety of methods to 245 | interact with HDFS. Most are modeled directly after the WebHDFS operations, a 246 | few of these are shown in the snippet below: 247 | 248 | .. code-block:: python 249 | 250 | # Retrieving a file or folder content summary. 251 | content = client.content('dat') 252 | 253 | # Listing all files inside a directory. 254 | fnames = client.list('dat') 255 | 256 | # Retrieving a file or folder status. 257 | status = client.status('dat/features') 258 | 259 | # Renaming ("moving") a file. 260 | client.rename('dat/features', 'features') 261 | 262 | # Deleting a file or folder. 263 | client.delete('dat', recursive=True) 264 | 265 | Other methods build on these to provide more advanced features: 266 | 267 | .. code-block:: python 268 | 269 | # Download a file or folder locally. 270 | client.download('dat', 'dat', n_threads=5) 271 | 272 | # Get all files under a given folder (arbitrary depth). 273 | import posixpath as psp 274 | fpaths = [ 275 | psp.join(dpath, fname) 276 | for dpath, _, fnames in client.walk('predictions') 277 | for fname in fnames 278 | ] 279 | 280 | See the :ref:`api_reference` for the comprehensive list of methods available. 281 | 282 | 283 | Checking path existence 284 | *********************** 285 | 286 | Most of the methods described above will raise an :class:`~hdfs.util.HdfsError` 287 | if called on a missing path. The recommended way of checking whether a path 288 | exists is using the :meth:`~hdfs.client.Client.content` or 289 | :meth:`~hdfs.client.Client.status` methods with a `strict=False` argument (in 290 | which case they will return `None` on a missing path). 291 | 292 | 293 | More 294 | **** 295 | 296 | See the :ref:`advanced_usage` section to learn more. 297 | 298 | 299 | .. 
_IPython: http://ipython.org/
300 | 
--------------------------------------------------------------------------------
/doc/requirements.txt:
--------------------------------------------------------------------------------
1 | mock; python_version<'3.3'
2 | 
--------------------------------------------------------------------------------
/etc/hadoop/core-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>fs.defaultFS</name>
4 |     <value>hdfs://localhost:51000</value>
5 |   </property>
6 |   <property>
7 |     <name>fs.trash.interval</name>
8 |     <value>10</value>
9 |   </property>
10 |   <property>
11 |     <name>fs.trash.checkpoint.interval</name>
12 |     <value>1</value>
13 |   </property>
14 |   <property>
15 |     <name>hadoop.proxyuser.#USER#.hosts</name>
16 |     <value>*</value>
17 |   </property>
18 |   <property>
19 |     <name>hadoop.proxyuser.#USER#.groups</name>
20 |     <value>*</value>
21 |   </property>
22 | </configuration>
23 | 
--------------------------------------------------------------------------------
/etc/hadoop/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>dfs.replication</name>
4 |     <value>1</value>
5 |   </property>
6 |   <property>
7 |     <name>dfs.support.append</name>
8 |     <value>true</value>
9 |   </property>
10 |   <property>
11 |     <name>dfs.webhdfs.enabled</name>
12 |     <value>true</value>
13 |   </property>
14 |   <property>
15 |     <name>dfs.namenode.acls.enabled</name>
16 |     <value>true</value>
17 |   </property>
18 | </configuration>
19 | 
--------------------------------------------------------------------------------
/etc/hadoop/httpfs-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>httpfs.authentication.signature.secret.file</name>
4 |     <value>${httpfs.config.dir}/httpfs-site.xml</value>
5 |   </property>
6 | </configuration>
7 | 
--------------------------------------------------------------------------------
/etc/hadoop/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
2 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
3 | log4j.appender.console.target=System.err
4 | log4j.appender.console=org.apache.log4j.ConsoleAppender
5 | log4j.rootLogger=INFO,console
6 | 
--------------------------------------------------------------------------------
/examples/avro-example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | """Avro extension example."""
5 | 
6 | from hdfs import Config
7 | from hdfs.ext.avro import AvroReader, AvroWriter
8 | 
9 | 
10 | # Get the default alias' client.
11 | client = Config().get_client()
12 | 
13 | # Some sample data.
14 | records = [
15 |   {'name': 'Ann', 'age': 23},
16 |   {'name': 'Bob', 'age': 22},
17 | ]
18 | 
19 | # Write an Avro File to HDFS (since our records' schema is very simple, we let
20 | # the writer infer it automatically, otherwise we would pass it as argument).
21 | with AvroWriter(client, 'names.avro', overwrite=True) as writer:
22 |   for record in records:
23 |     writer.write(record)
24 | 
25 | # Read it back.
26 | with AvroReader(client, 'names.avro') as reader:
27 |   schema = reader.schema # The inferred schema.
28 |   content = reader.content # The remote file's HDFS content object.
29 |   assert list(reader) == records # The records match!
30 | 
--------------------------------------------------------------------------------
/examples/dataframe-example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | """Dataframe extension example."""
5 | 
6 | from hdfs import Config
7 | from hdfs.ext.dataframe import read_dataframe, write_dataframe
8 | import pandas as pd
9 | 
10 | 
11 | # Get the default alias' client.
12 | client = Config().get_client()
13 | 
14 | # A sample dataframe.
15 | df = pd.DataFrame.from_records([ 16 | {'A': 1, 'B': 2}, 17 | {'A': 11, 'B': 23} 18 | ]) 19 | 20 | # Write dataframe to HDFS using Avro serialization. 21 | write_dataframe(client, 'data.avro', df, overwrite=True) 22 | 23 | # Read the Avro file back from HDFS. 24 | _df = read_dataframe(client, 'data.avro') 25 | 26 | # The frames match! 27 | pd.testing.assert_frame_equal(df, _df) 28 | -------------------------------------------------------------------------------- /examples/json-example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Sample HdfsCLI script. 5 | 6 | This example shows how to write files to HDFS, read them back, and perform a 7 | few other simple filesystem operations. 8 | 9 | """ 10 | 11 | from hdfs import Config 12 | from json import dump, load 13 | 14 | 15 | # Get the default alias' client. (See the quickstart section in the 16 | # documentation to learn more about this.) 17 | client = Config().get_client() 18 | 19 | # Some fake data that we are interested in uploading to HDFS. 20 | model = { 21 | '(intercept)': 48., 22 | 'first_feature': 2., 23 | 'second_feature': 12., 24 | } 25 | 26 | # First, we delete any existing `models/` folder on HDFS. 27 | client.delete('models', recursive=True) 28 | 29 | # We can now upload the data, first as CSV. 30 | with client.write('models/1.csv', encoding='utf-8') as writer: 31 | for item in model.items(): 32 | writer.write(u'%s,%s\n' % item) 33 | 34 | # We can also serialize it to JSON and directly upload it. 35 | with client.write('models/1.json', encoding='utf-8') as writer: 36 | dump(model, writer) 37 | 38 | # We can check that the files exist and get their properties. 39 | assert client.list('models') == ['1.csv', '1.json'] 40 | status = client.status('models/1.csv') 41 | content = client.content('models/1.json') 42 | 43 | # Later, we can download the files back. The `delimiter` option makes it 44 | # convenient to read CSV files. 45 | with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader: 46 | items = (line.split(',') for line in reader if line) 47 | assert {name: float(value) for name, value in items} == model 48 | 49 | # Loading JSON directly from HDFS is even simpler. 50 | with client.read('models/1.json', encoding='utf-8') as reader: 51 | assert load(reader) == model 52 | -------------------------------------------------------------------------------- /hdfs/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """HdfsCLI: API and command line interface for HDFS.""" 5 | 6 | from .client import Client, InsecureClient, TokenClient 7 | from .config import Config, NullHandler 8 | from .util import HdfsError 9 | import logging as lg 10 | 11 | 12 | __version__ = '2.7.3' 13 | __license__ = 'MIT' 14 | 15 | 16 | lg.getLogger(__name__).addHandler(NullHandler()) 17 | -------------------------------------------------------------------------------- /hdfs/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """HdfsCLI: a command line interface for HDFS. 5 | 6 | Usage: 7 | hdfscli [interactive] [-a ALIAS] [-v...] 8 | hdfscli download [-fsa ALIAS] [-v...] [-t THREADS] HDFS_PATH LOCAL_PATH 9 | hdfscli upload [-sa ALIAS] [-v...] 
[-A | -f] [-t THREADS] LOCAL_PATH HDFS_PATH 10 | hdfscli -L | -V | -h 11 | 12 | Commands: 13 | download Download a file or folder from HDFS. If a 14 | single file is downloaded, - can be 15 | specified as LOCAL_PATH to stream it to 16 | standard out. 17 | interactive Start the client and expose it via the python 18 | interpreter (using iPython if available). 19 | upload Upload a file or folder to HDFS. - can be 20 | specified as LOCAL_PATH to read from standard 21 | in. 22 | 23 | Arguments: 24 | HDFS_PATH Remote HDFS path. 25 | LOCAL_PATH Path to local file or directory. 26 | 27 | Options: 28 | -A --append Append data to an existing file. Only supported 29 | if uploading a single file or from standard in. 30 | -L --log Show path to current log file and exit. 31 | -V --version Show version and exit. 32 | -a ALIAS --alias=ALIAS Alias of namenode to connect to. 33 | -f --force Allow overwriting any existing files. 34 | -s --silent Don't display progress status. 35 | -t THREADS --threads=THREADS Number of threads to use for parallelization. 36 | 0 allocates a thread per file. [default: 0] 37 | -v --verbose Enable log output. Can be specified up to three 38 | times (increasing verbosity each time). 39 | 40 | Examples: 41 | hdfscli -a prod /user/foo 42 | hdfscli download features.avro dat/ 43 | hdfscli download logs/1987-03-23 - >>logs 44 | hdfscli upload -f - data/weights.tsv 0: 144 | self._writer.write( 145 | '%3.1f%%\t[ pending: %d | downloading: %d | complete: %d ] \r' % 146 | ( 147 | 100. * sum(data.values()) / self._total_bytes, 148 | self._pending_files, 149 | self._downloading_files, 150 | self._complete_files, 151 | ) 152 | ) 153 | else: 154 | self._writer.write('%79s\r' % ('', )) 155 | 156 | @classmethod 157 | def from_hdfs_path(cls, client, hdfs_path, writer=None): 158 | """Instantiate from remote path. 159 | 160 | :param client: HDFS client. 161 | :param hdfs_path: HDFS path. 162 | 163 | """ 164 | content = client.content(hdfs_path) 165 | return cls(content['length'], content['fileCount'], writer=writer) 166 | 167 | @classmethod 168 | def from_local_path(cls, local_path, writer=None): 169 | """Instantiate from a local path. 170 | 171 | :param local_path: Local path. 172 | 173 | """ 174 | if osp.isdir(local_path): 175 | nbytes = 0 176 | nfiles = 0 177 | for dpath, _, fnames in os.walk(local_path): 178 | for fname in fnames: 179 | nbytes += osp.getsize(osp.join(dpath, fname)) 180 | nfiles += 1 181 | elif osp.exists(local_path): 182 | nbytes = osp.getsize(local_path) 183 | nfiles = 1 184 | else: 185 | raise HdfsError('No file found at: %s', local_path) 186 | return cls(nbytes, nfiles, writer=writer) 187 | 188 | @catch(HdfsError) 189 | def main(argv=None, client=None): 190 | """Entry point. 191 | 192 | :param argv: Arguments list. 193 | :param client: For testing. 
194 | 195 | """ 196 | args = docopt(__doc__, argv=argv, version=__version__) 197 | if not client: 198 | client = configure_client('hdfscli', args) 199 | elif args['--log']: 200 | raise HdfsError('Logging is only available when no client is specified.') 201 | hdfs_path = args['HDFS_PATH'] 202 | local_path = args['LOCAL_PATH'] 203 | n_threads = parse_arg(args, '--threads', int) 204 | force = args['--force'] 205 | silent = args['--silent'] 206 | if args['download']: 207 | chunk_size = 2 ** 16 208 | if local_path == '-': 209 | if not sys.stdout.isatty() and sys.stderr.isatty() and not silent: 210 | progress = _Progress.from_hdfs_path(client, hdfs_path) 211 | else: 212 | progress = None 213 | with client.read( 214 | hdfs_path, 215 | chunk_size=chunk_size, 216 | progress=progress, 217 | ) as reader: 218 | # https://stackoverflow.com/a/23932488/1062617 219 | stdout = getattr(sys.stdout, 'buffer', sys.stdout) 220 | for chunk in reader: 221 | stdout.write(chunk) 222 | else: 223 | if sys.stderr.isatty() and not silent: 224 | progress = _Progress.from_hdfs_path(client, hdfs_path) 225 | else: 226 | progress = None 227 | client.download( 228 | hdfs_path, 229 | local_path, 230 | overwrite=force, 231 | n_threads=n_threads, 232 | chunk_size=chunk_size, 233 | progress=progress, 234 | ) 235 | elif args['upload']: 236 | append = args['--append'] 237 | if local_path == '-': 238 | client.write( 239 | hdfs_path, 240 | (line for line in sys.stdin), # Doesn't work with stdin. 241 | append=append, 242 | overwrite=force, 243 | ) 244 | else: 245 | if append: 246 | # TODO: Add progress tracking here. 247 | if osp.isfile(local_path): 248 | with open(local_path) as reader: 249 | client.write(hdfs_path, reader, append=True) 250 | else: 251 | raise HdfsError('Can only append when uploading a single file.') 252 | else: 253 | if sys.stderr.isatty() and not silent: 254 | progress = _Progress.from_local_path(local_path) 255 | else: 256 | progress = None 257 | client.upload( 258 | hdfs_path, 259 | local_path, 260 | overwrite=force, 261 | n_threads=n_threads, 262 | progress=progress, 263 | ) 264 | else: 265 | banner = ( 266 | '\n' 267 | 'Welcome to the interactive HDFS python shell.\n' 268 | 'The HDFS client is available as `CLIENT`.\n' 269 | ) 270 | namespace = {'CLIENT': client} 271 | try: 272 | from IPython import embed 273 | except ImportError: 274 | from code import interact 275 | interact(banner=banner, local=namespace) 276 | else: 277 | embed(banner1=banner, user_ns=namespace) 278 | 279 | if __name__ == '__main__': 280 | main() 281 | -------------------------------------------------------------------------------- /hdfs/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Command line interface configuration module. 5 | 6 | This module provides programmatic access to HdfsCLI's configuration settings. 7 | In particular it exposes the ability to instantiate clients from aliases (see 8 | :meth:`Config.get_client`). 
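For illustration only, a minimal round trip could look as follows; the alias
name, URL, and configuration contents shown here are placeholders rather than
values shipped with this project:

.. code-block:: python

    # Assumed contents of `~/.hdfscli.cfg` (or the file pointed to by the
    # `HDFSCLI_CONFIG` environment variable):
    #
    #   [global]
    #   default.alias = dev
    #
    #   [dev.alias]
    #   url = http://localhost:9870
    #
    from hdfs import Config

    config = Config()
    client = config.get_client()         # Resolves the `default.alias` entry.
    explicit = config.get_client('dev')  # Or look an alias up by name.
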
9 | 10 | """ 11 | 12 | from .client import Client 13 | from .util import HdfsError 14 | from functools import wraps 15 | from logging.handlers import TimedRotatingFileHandler 16 | from six.moves.configparser import ParsingError, RawConfigParser 17 | from tempfile import gettempdir 18 | import importlib.util 19 | import importlib.machinery 20 | import logging as lg 21 | import os 22 | import os.path as osp 23 | import sys 24 | 25 | _logger = lg.getLogger(__name__) 26 | 27 | 28 | def _load_source(modname, filename): 29 | """Imitate the old imp.load_source() function, removed in Python 3.12""" 30 | # Based on sample code in https://docs.python.org/3.12/whatsnew/3.12.html. 31 | loader = importlib.machinery.SourceFileLoader(modname, filename) 32 | spec = importlib.util.spec_from_file_location(modname, filename, loader=loader) 33 | module = importlib.util.module_from_spec(spec) 34 | sys.modules[module.__name__] = module 35 | loader.exec_module(module) 36 | return module 37 | 38 | 39 | class NullHandler(lg.Handler): 40 | 41 | """Pass-through logging handler. 42 | 43 | This is required for python <2.7. 44 | 45 | """ 46 | 47 | def emit(self, record): 48 | """Do nothing.""" 49 | pass 50 | 51 | 52 | class Config(RawConfigParser): 53 | 54 | """Configuration class. 55 | 56 | :param path: path to configuration file. If no file exists at that location, 57 | the configuration parser will be empty. If not specified, the value of the 58 | `HDFSCLI_CONFIG` environment variable is used if it exists, otherwise it 59 | defaults to `~/.hdfscli.cfg`. 60 | :param stream_log_level: Stream handler log level, attached to the root 61 | logger. A false-ish value will disable this handler. This is particularly 62 | useful with the :func:`catch` function which reports exceptions as log 63 | messages. 64 | 65 | On instantiation, the configuration object will attempt to load modules 66 | defined in the `autoload` global options (see :ref:`custom_client` for more 67 | information). 68 | 69 | """ 70 | 71 | default_path = osp.expanduser('~/.hdfscli.cfg') 72 | global_section = 'global' 73 | 74 | def __init__(self, path=None, stream_log_level=None): 75 | RawConfigParser.__init__(self) 76 | self._clients = {} 77 | self.path = path or os.getenv('HDFSCLI_CONFIG', self.default_path) 78 | if stream_log_level: 79 | stream_handler = lg.StreamHandler() 80 | stream_handler.setLevel(stream_log_level) 81 | fmt = '%(levelname)s\t%(message)s' 82 | stream_handler.setFormatter(lg.Formatter(fmt)) 83 | lg.getLogger().addHandler(stream_handler) 84 | if osp.exists(self.path): 85 | try: 86 | self.read(self.path) 87 | except ParsingError: 88 | raise HdfsError('Invalid configuration file %r.', self.path) 89 | else: 90 | self._autoload() 91 | _logger.info('Instantiated configuration from %r.', self.path) 92 | else: 93 | _logger.info('Instantiated empty configuration.') 94 | 95 | def __repr__(self): 96 | return ''.format(self.path) 97 | 98 | def get_client(self, alias=None): 99 | """Load HDFS client. 100 | 101 | :param alias: The client to look up. If not specified, the default alias be 102 | used (`default.alias` option in the `global` section) if available and an 103 | error will be raised otherwise. 104 | 105 | Further calls to this method for the same alias will return the same client 106 | instance (in particular, any option changes to this alias will not be taken 107 | into account). 
108 | 109 | """ 110 | if not alias: 111 | if ( 112 | not self.has_section(self.global_section) or 113 | not self.has_option(self.global_section, 'default.alias') 114 | ): 115 | raise HdfsError('No alias specified and no default alias found.') 116 | alias = self.get(self.global_section, 'default.alias') 117 | if not alias in self._clients: 118 | for suffix in ('.alias', '_alias'): 119 | section = '{}{}'.format(alias, suffix) 120 | if self.has_section(section): 121 | options = dict(self.items(section)) 122 | class_name = options.pop('client', 'InsecureClient') 123 | # Massage options. 124 | if 'timeout' in options: 125 | timeout = tuple(int(s) for s in options['timeout'].split(',')) 126 | options['timeout'] = timeout[0] if len(timeout) == 1 else timeout 127 | self._clients[alias] = Client.from_options(options, class_name) 128 | break 129 | else: 130 | raise HdfsError('Alias %r not found in %r.', alias, self.path) 131 | return self._clients[alias] 132 | 133 | def get_log_handler(self, command): 134 | """Configure and return log handler. 135 | 136 | :param command: The command to load the configuration for. All options will 137 | be looked up in the `[COMMAND.command]` section. This is currently only 138 | used for configuring the file handler for logging. If logging is disabled 139 | for the command, a :class:`NullHandler` will be returned, else a 140 | :class:`TimedRotatingFileHandler`. 141 | 142 | """ 143 | section = '{}.command'.format(command) 144 | path = osp.join(gettempdir(), '{}.log'.format(command)) 145 | level = lg.DEBUG 146 | if self.has_section(section): 147 | key = 'log.disable' 148 | if self.has_option(section, key) and self.getboolean(section, key): 149 | return NullHandler() 150 | if self.has_option(section, 'log.path'): 151 | path = self.get(section, 'log.path') # Override default path. 152 | if self.has_option(section, 'log.level'): 153 | level = getattr(lg, self.get(section, 'log.level').upper()) 154 | file_handler = TimedRotatingFileHandler( 155 | path, 156 | when='midnight', # Daily backups. 157 | backupCount=1, 158 | encoding='utf-8', 159 | ) 160 | fmt = '%(asctime)s\t%(name)-16s\t%(levelname)-5s\t%(message)s' 161 | file_handler.setFormatter(lg.Formatter(fmt)) 162 | file_handler.setLevel(level) 163 | return file_handler 164 | 165 | def _autoload(self): 166 | """Load modules to find clients.""" 167 | 168 | def _load(suffix, loader): 169 | """Generic module loader.""" 170 | option = 'autoload.{}'.format(suffix) 171 | if self.has_option(self.global_section, option): 172 | entries = self.get(self.global_section, option) 173 | for entry in entries.split(','): 174 | module = entry.strip() 175 | try: 176 | loader(module) 177 | except Exception: # pylint: disable=broad-except 178 | _logger.exception( 179 | 'Unable to load %r defined at %r.', 180 | module, self.path 181 | ) 182 | sys.exit(1) 183 | 184 | 185 | _load('modules', __import__) 186 | _load('paths', lambda path: _load_source( 187 | osp.splitext(osp.basename(path))[0], 188 | path 189 | )) 190 | 191 | 192 | def catch(*error_classes): 193 | r"""Returns a decorator that catches errors and prints messages to stderr. 194 | 195 | :param \*error_classes: Error classes. 196 | 197 | Also exits with status 1 if any errors are caught. 198 | 199 | """ 200 | def decorator(func): 201 | """Decorator.""" 202 | @wraps(func) 203 | def wrapper(*args, **kwargs): 204 | """Wrapper. 
Finally.""" 205 | try: 206 | return func(*args, **kwargs) 207 | except error_classes as err: 208 | _logger.error(err) 209 | sys.exit(1) 210 | except Exception: # pylint: disable=broad-except 211 | _logger.exception('Unexpected exception.') 212 | sys.exit(1) 213 | return wrapper 214 | return decorator 215 | -------------------------------------------------------------------------------- /hdfs/ext/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Extensions.""" 5 | -------------------------------------------------------------------------------- /hdfs/ext/avro/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # pylint: disable=protected-access 4 | 5 | """Read and write Avro_ files directly from HDFS. 6 | 7 | This extension enables streaming decoding and encoding of files from and to 8 | HDFS. It requires the `fastavro` library. 9 | 10 | + :class:`AvroWriter` writes Avro files on HDFS from python objects. 11 | + :class:`AvroReader` reads Avro files from HDFS into an iterable of records. 12 | 13 | Sample usage: 14 | 15 | .. literalinclude:: ../examples/avro.py 16 | 17 | It also features an entry point (named `hdfscli-avro` by default) which 18 | provides access to the above functionality from the shell. For usage examples 19 | and more information: 20 | 21 | .. code-block:: bash 22 | 23 | $ hdfscli-avro --help 24 | 25 | .. _Avro: https://avro.apache.org/docs/1.7.7/index.html 26 | 27 | """ 28 | 29 | from ...util import AsyncWriter, HdfsError 30 | from json import dumps 31 | from six import integer_types, string_types 32 | import fastavro 33 | import io 34 | import logging as lg 35 | import os 36 | import posixpath as psp 37 | import sys 38 | 39 | 40 | _logger = lg.getLogger(__name__) 41 | 42 | 43 | # The number of bytes in a sync marker (http://mtth.xyz/_9lc9t3hjtx69x54). 44 | SYNC_SIZE = 16 45 | 46 | class _SchemaInferrer(object): 47 | 48 | """Utility to infer Avro schemas from python values.""" 49 | 50 | def __init__(self): 51 | self.record_index = 0 52 | 53 | def infer(self, obj): 54 | """Infer Avro type corresponding to a python object. 55 | 56 | :param obj: Python primitive. 57 | 58 | There are multiple limitations with this functions, among which: 59 | 60 | + Nullable fields aren't supported. 61 | + Only Avro integers will be inferred, so some values may overflow. 62 | + Record names are auto-generated. 63 | 64 | """ 65 | if isinstance(obj, bool): 66 | return 'boolean' 67 | elif isinstance(obj, string_types): 68 | return 'string' 69 | elif isinstance(obj, integer_types): # Python 3 doesn't have `long`. 70 | return 'int' 71 | elif isinstance(obj, float): 72 | return 'float' 73 | elif isinstance(obj, list): 74 | if not obj: 75 | raise ValueError('Cannot infer type of empty array.') 76 | return { 77 | 'type': 'array', 78 | 'items': self.infer(obj[0]) 79 | } 80 | elif isinstance(obj, dict): 81 | if not obj: 82 | raise ValueError('Cannot infer type of empty record.') 83 | self.record_index += 1 84 | return { 85 | 'name': '__Record{}'.format(self.record_index), 86 | 'type': 'record', 87 | 'fields': [ 88 | {'name': k, 'type': self.infer(v)} 89 | for k, v in sorted(obj.items()) # Sort fields by name. 90 | ] 91 | } 92 | raise ValueError('Cannot infer type from {}: {!r}'.format(type(obj), obj)) 93 | 94 | 95 | class _SeekableReader(object): 96 | 97 | """Customized reader for Avro. 
98 | 99 | :param reader: Non-seekable reader. 100 | :param size: For testing. 101 | 102 | It detects reads of sync markers' sizes and will buffer these. Note that this 103 | reader is heavily particularized to how the `fastavro` library performs Avro 104 | decoding. 105 | 106 | """ 107 | 108 | def __init__(self, reader, size=None): 109 | self._reader = reader 110 | self._size = size or SYNC_SIZE 111 | self._buffer = None 112 | self._saught = False 113 | 114 | def read(self, nbytes): 115 | """Read bytes, caching the read if the size matches.""" 116 | buf = self._buffer 117 | if self._saught: 118 | assert buf 119 | missing_bytes = nbytes - len(buf) 120 | if missing_bytes < 0: 121 | chunk = buf[:nbytes] 122 | self._buffer = buf[nbytes:] 123 | else: 124 | chunk = buf 125 | if missing_bytes: 126 | chunk += self._reader.read(missing_bytes) 127 | self._buffer = None 128 | self._saught = False 129 | else: 130 | self._buffer = None 131 | chunk = self._reader.read(nbytes) 132 | if nbytes == self._size: 133 | self._buffer = chunk 134 | return chunk 135 | 136 | def seek(self, offset, whence): 137 | """Go back using the cached bytes.""" 138 | assert offset == - self._size 139 | assert whence == os.SEEK_CUR 140 | assert self._buffer 141 | self._saught = True 142 | 143 | 144 | class AvroReader(object): 145 | 146 | """HDFS Avro file reader. 147 | 148 | :param client: :class:`hdfs.client.Client` instance. 149 | :param hdfs_path: Remote path. 150 | :param parts: Part-files to read, when reading a distributed file. The 151 | default is to read all part-files in order. See 152 | :meth:`hdfs.client.Client.parts` for details. 153 | :param reader_schema: Schema to read the data as. If specified, it must be 154 | compatible with the writer's schema (the default). 155 | 156 | The contents of the file will be decoded in a streaming manner, as the data 157 | is transferred. This makes it possible to use on files of arbitrary size. As 158 | a convenience, the content summary object of the remote file is available on 159 | the reader's `content` attribute. 160 | 161 | Usage: 162 | 163 | .. code-block:: python 164 | 165 | with AvroReader(client, 'foo.avro') as reader: 166 | schema = reader.writer_schema # The remote file's Avro schema. 167 | content = reader.content # Content metadata (e.g. size). 168 | for record in reader: 169 | pass # and its records 170 | 171 | """ 172 | 173 | def __init__(self, client, hdfs_path, parts=None, reader_schema=None): 174 | self.content = client.content(hdfs_path) #: Content summary of Avro file. 175 | self.metadata = None #: Avro header metadata. 176 | self.reader_schema = reader_schema #: Input reader schema. 177 | self._writer_schema = None 178 | if self.content['directoryCount']: 179 | # This is a folder. 180 | self._paths = [ 181 | psp.join(hdfs_path, fname) 182 | for fname in client.parts(hdfs_path, parts) 183 | ] 184 | else: 185 | # This is a single file. 
186 | self._paths = [hdfs_path] 187 | self._client = client 188 | self._records = None 189 | _logger.debug('Instantiated %r.', self) 190 | 191 | def __repr__(self): 192 | return ''.format(self._paths) 193 | 194 | def __enter__(self): 195 | 196 | def _reader(): 197 | """Record generator over all part-files.""" 198 | for path in self._paths: 199 | with self._client.read(path) as bytes_reader: 200 | reader = fastavro.reader( 201 | _SeekableReader(bytes_reader), 202 | reader_schema=self.reader_schema 203 | ) 204 | if not self._writer_schema: 205 | schema = reader.writer_schema 206 | _logger.debug('Read schema from %r.', path) 207 | yield (schema, reader.metadata) 208 | for record in reader: 209 | yield record 210 | 211 | self._records = _reader() 212 | self._writer_schema, self.metadata = next(self._records) 213 | return self 214 | 215 | def __exit__(self, exc_type, exc_value, traceback): 216 | self._records.close() 217 | _logger.debug('Closed records iterator for %r.', self) 218 | 219 | def __iter__(self): # pylint: disable=non-iterator-returned 220 | if not self._records: 221 | raise HdfsError('Iteration is only supported inside a `with` block.') 222 | return self._records 223 | 224 | @property 225 | def writer_schema(self): 226 | """Get the underlying file's schema. 227 | 228 | The schema will only be available after entering the reader's corresponding 229 | `with` block. 230 | 231 | """ 232 | if not self._writer_schema: 233 | raise HdfsError('Schema not yet inferred.') 234 | return self._writer_schema 235 | 236 | # Legacy property, preserved for backwards-compatibility. 237 | schema = writer_schema 238 | 239 | 240 | class AvroWriter(object): 241 | 242 | r"""Write an Avro file on HDFS from python dictionaries. 243 | 244 | :param client: :class:`hdfs.client.Client` instance. 245 | :param hdfs_path: Remote path. 246 | :param schema: Avro schema. If not specified, the writer will try to infer it 247 | from the first record sent. There are however limitations regarding what 248 | can be inferred. 249 | :param codec: Compression codec. The default is `'null'` (no compression). 250 | :param sync_interval: Number of bytes after which a block will be written. 251 | :param sync_marker: 16 byte tag used for synchronization. If not specified, 252 | one will be generated at random. 253 | :param metadata: Additional metadata to include in the container file's 254 | header. Keys starting with `'avro.'` are reserved. 255 | :param \*\*kwargs: Keyword arguments forwarded to 256 | :meth:`hdfs.client.Client.write`. 257 | 258 | Usage: 259 | 260 | .. 
code-block:: python 261 | 262 | with AvroWriter(client, 'data.avro') as writer: 263 | for record in records: 264 | writer.write(record) 265 | 266 | """ 267 | 268 | def __init__(self, client, hdfs_path, schema=None, codec=None, 269 | sync_interval=None, sync_marker=None, metadata=None, **kwargs): 270 | self._hdfs_path = hdfs_path 271 | self._fo = client.write(hdfs_path, **kwargs) 272 | self._schema = schema 273 | self._writer_kwargs = { 274 | 'codec': codec or 'null', 275 | 'metadata': metadata, 276 | 'sync_interval': sync_interval or 1000 * SYNC_SIZE, 277 | 'sync_marker': sync_marker or os.urandom(SYNC_SIZE), 278 | } 279 | self._entered = False 280 | self._writer = None 281 | _logger.info('Instantiated %r.', self) 282 | 283 | def __repr__(self): 284 | return ''.format(self._hdfs_path) 285 | 286 | def __enter__(self): 287 | if self._entered: 288 | raise HdfsError('Avro writer cannot be reused.') 289 | self._entered = True 290 | if self._schema: 291 | self._start_writer() 292 | return self 293 | 294 | def __exit__(self, *exc_info): 295 | if not self._writer: 296 | return # No header or records were written. 297 | try: 298 | self._writer.__exit__(*exc_info) 299 | _logger.debug('Closed underlying writer.') 300 | finally: 301 | self._fo.__exit__(*exc_info) 302 | 303 | @property 304 | def schema(self): 305 | """Avro schema.""" 306 | if not self._schema: 307 | raise HdfsError('Schema not yet inferred.') 308 | return self._schema 309 | 310 | def write(self, record): 311 | """Store a record. 312 | 313 | :param record: Record object to store. 314 | 315 | Only available inside the `with` block. 316 | 317 | """ 318 | if not self._entered: 319 | raise HdfsError('Avro writer not available outside context block.') 320 | if not self._schema: 321 | self._schema = _SchemaInferrer().infer(record) 322 | _logger.info('Inferred schema: %s', dumps(self._schema)) 323 | self._start_writer() 324 | self._writer.write(record) 325 | 326 | def _start_writer(self): 327 | _logger.debug('Starting underlying writer.') 328 | 329 | def write(records): 330 | fastavro.writer( 331 | fo=self._fo.__enter__(), 332 | schema=self._schema, 333 | records=records, 334 | **self._writer_kwargs 335 | ) 336 | 337 | self._writer = AsyncWriter(write).__enter__() 338 | -------------------------------------------------------------------------------- /hdfs/ext/avro/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """HdfsCLI Avro: an Avro extension for HdfsCLI. 5 | 6 | Usage: 7 | hdfscli-avro schema [-a ALIAS] [-v...] HDFS_PATH 8 | hdfscli-avro read [-a ALIAS] [-v...] [-F FREQ | -n NUM] [-p PARTS] HDFS_PATH 9 | hdfscli-avro write [-fa ALIAS] [-v...] [-C CODEC] [-S SCHEMA] HDFS_PATH 10 | hdfscli-avro -L | -h 11 | 12 | Commands: 13 | schema Pretty print schema. 14 | read Read an Avro file from HDFS and output records 15 | as JSON to standard out. 16 | write Read JSON records from standard in and 17 | serialize them into a single Avro file on HDFS. 18 | 19 | Arguments: 20 | HDFS_PATH Remote path to Avro file or directory 21 | containing Avro part-files. 22 | 23 | Options: 24 | -C CODEC --codec=CODEC Compression codec. Available values are among: 25 | null, deflate, snappy. [default: deflate] 26 | -F FREQ --freq=FREQ Probability of sampling a record. 27 | -L --log Show path to current log file and exit. 28 | -S SCHEMA --schema=SCHEMA Schema for serializing records. If not passed, 29 | it will be inferred from the first record. 
30 | -a ALIAS --alias=ALIAS Alias of namenode to connect to. 31 | -f --force Overwrite any existing file. 32 | -h --help Show this message and exit. 33 | -n NUM --num=NUM Cap number of records to output. 34 | -p PARTS --parts=PARTS Part-files to read. Specify a number to 35 | randomly select that many, or a comma-separated 36 | list of numbers to read only these. Use a 37 | number followed by a comma (e.g. `1,`) to get a 38 | unique part-file. The default is to read all 39 | part-files. 40 | -v --verbose Enable log output. Can be specified up to three 41 | times (increasing verbosity each time). 42 | 43 | Examples: 44 | hdfscli-avro schema /data/impressions.avro 45 | hdfscli-avro read -a dev snapshot.avro >snapshot.jsonl 46 | hdfscli-avro read -F 0.1 -p 2,3 clicks.avro 47 | hdfscli-avro write -f positives.avro 0: 75 | sleep(delay) # Avoid replay errors. 76 | self._timestamp = time() 77 | return super(_HdfsHTTPKerberosAuth, self).__call__(req) 78 | 79 | 80 | class KerberosClient(Client): 81 | 82 | r"""HDFS web client using Kerberos authentication. 83 | 84 | :param url: Hostname or IP address of HDFS namenode, prefixed with protocol, 85 | followed by WebHDFS port on namenode. 86 | :param mutual_auth: Whether to enforce mutual authentication or not (possible 87 | values: `'REQUIRED'`, `'OPTIONAL'`, `'DISABLED'`). 88 | :param max_concurrency: Maximum number of allowed concurrent authentication 89 | requests. This is required since requests exceeding the threshold allowed 90 | by the server will be unable to authenticate. 91 | :param proxy: User to proxy as. 92 | :param root: Root path, this will be prefixed to all HDFS paths passed to the 93 | client. If the root is relative, the path will be assumed relative to the 94 | user's home directory. 95 | :param timeout: Connection timeouts, forwarded to the request handler. How 96 | long to wait for the server to send data before giving up, as a float, or a 97 | `(connect_timeout, read_timeout)` tuple. If the timeout is reached, an 98 | appropriate exception will be raised. See the requests_ documentation for 99 | details. 100 | :param session: `requests.Session` instance, used to emit all requests. 101 | :param \*\*kwargs: Additional arguments passed to the underlying 102 | :class:`~requests_kerberos.HTTPKerberosAuth` class. 103 | 104 | To avoid replay errors, a timeout of 1 ms is enforced between requests. If a 105 | session argument is passed in, it will be modified in-place to support 106 | authentication. 107 | 108 | """ 109 | 110 | def __init__(self, url, mutual_auth='OPTIONAL', max_concurrency=1, root=None, 111 | proxy=None, timeout=None, session=None, **kwargs): 112 | # We allow passing in a string as mutual authentication value. 
113 | if isinstance(mutual_auth, string_types): 114 | try: 115 | mutual_auth = getattr(requests_kerberos, mutual_auth) 116 | except AttributeError: 117 | raise HdfsError('Invalid mutual authentication type: %r', mutual_auth) 118 | kwargs['mutual_authentication'] = mutual_auth 119 | if not session: 120 | session = rq.Session() 121 | session.auth = _HdfsHTTPKerberosAuth(int(max_concurrency), **kwargs) 122 | super(KerberosClient, self).__init__( 123 | url, root=root, proxy=proxy, timeout=timeout, session=session 124 | ) 125 | -------------------------------------------------------------------------------- /hdfs/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Common utilities.""" 5 | 6 | from contextlib import contextmanager 7 | from shutil import rmtree 8 | from six.moves.queue import Queue 9 | from tempfile import mkstemp 10 | from threading import Thread 11 | import logging as lg 12 | import os 13 | import os.path as osp 14 | 15 | 16 | _logger = lg.getLogger(__name__) 17 | 18 | 19 | class HdfsError(Exception): 20 | 21 | """Base error class. 22 | 23 | :param message: Error message. 24 | :param args: optional Message formatting arguments. 25 | 26 | """ 27 | 28 | def __init__(self, message, *args, **kwargs): 29 | self.message = message % args if args else message 30 | super(HdfsError, self).__init__(self.message) 31 | self.exception = kwargs.get("exception") 32 | 33 | 34 | class AsyncWriter(object): 35 | 36 | """Asynchronous publisher-consumer. 37 | 38 | :param consumer: Function which takes a single generator as argument. 39 | 40 | This class can be used to transform functions which expect a generator into 41 | file-like writer objects. This can make it possible to combine different APIs 42 | together more easily. For example, to send streaming requests: 43 | 44 | .. code-block:: python 45 | 46 | import requests as rq 47 | 48 | with AsyncWriter(lambda data: rq.post(URL, data=data)) as writer: 49 | writer.write('Hello, world!') 50 | 51 | """ 52 | 53 | # Expected by pandas to write csv files (https://github.com/mtth/hdfs/pull/130). 
54 | __iter__ = None 55 | 56 | def __init__(self, consumer): 57 | self._consumer = consumer 58 | self._queue = None 59 | self._reader = None 60 | self._err = None 61 | _logger.debug('Instantiated %r.', self) 62 | 63 | def __repr__(self): 64 | return '<{}(consumer={!r})>'.format(self.__class__.__name__, self._consumer) 65 | 66 | def __enter__(self): 67 | if self._queue: 68 | raise ValueError('Cannot nest contexts.') 69 | self._queue = Queue() 70 | self._err = None 71 | 72 | def consumer(data): 73 | """Wrapped consumer that lets us get a child's exception.""" 74 | try: 75 | _logger.debug('Starting consumer.') 76 | self._consumer(data) 77 | except Exception as err: # pylint: disable=broad-except 78 | _logger.exception('Exception in child.') 79 | self._err = err 80 | finally: 81 | _logger.debug('Finished consumer.') 82 | 83 | def reader(queue): 84 | """Generator read by the consumer.""" 85 | while True: 86 | chunk = queue.get() 87 | if chunk is None: 88 | break 89 | yield chunk 90 | 91 | self._reader = Thread(target=consumer, args=(reader(self._queue), )) 92 | self._reader.start() 93 | _logger.debug('Started child thread.') 94 | return self 95 | 96 | def __exit__(self, exc_type, exc_value, traceback): 97 | if exc_value: 98 | _logger.debug('Exception in parent.') 99 | if self._reader and self._reader.is_alive(): 100 | _logger.debug('Signaling child.') 101 | self._queue.put(None) 102 | self._reader.join() 103 | if self._err: 104 | raise self._err # pylint: disable=raising-bad-type 105 | else: 106 | _logger.debug('Child terminated without errors.') 107 | self._queue = None 108 | 109 | def flush(self): 110 | """Pass-through implementation.""" 111 | pass 112 | 113 | def seekable(self): 114 | """Implement file-like method expected by certain libraries. 115 | 116 | `fastavro` relies on it in python 3. 117 | 118 | """ 119 | return False 120 | 121 | def tell(self): 122 | """No-op implementation.""" 123 | return 0 124 | 125 | def write(self, chunk): 126 | """Stream data to the underlying consumer. 127 | 128 | :param chunk: Bytes to write. These will be buffered in memory until the 129 | consumer reads them. 130 | 131 | """ 132 | if chunk: 133 | # We skip empty chunks, otherwise they cause request to terminate the 134 | # response stream. Note that these chunks can be produced by valid 135 | # upstream encoders (e.g. bzip2). 136 | self._queue.put(chunk) 137 | 138 | 139 | @contextmanager 140 | def temppath(dpath=None): 141 | """Create a temporary path. 142 | 143 | :param dpath: Explicit directory name where to create the temporary path. A 144 | system dependent default will be used otherwise (cf. `tempfile.mkstemp`). 145 | 146 | Usage:: 147 | 148 | with temppath() as path: 149 | pass # do stuff 150 | 151 | Any file or directory corresponding to the path will be automatically deleted 152 | afterwards. 
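  Note that the temporary file created by `mkstemp` is removed before the path
  is yielded, so the path does not exist when the block is entered; creating a
  file or directory there is left to the caller. For instance (an illustrative
  sketch only)::

    with temppath() as path:
      with open(path, 'w') as writer:
        writer.write('hello')
      # Whatever was created at `path` is deleted when the block exits.
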
153 | 154 | """ 155 | (desc, path) = mkstemp(dir=dpath) 156 | os.close(desc) 157 | os.remove(path) 158 | try: 159 | _logger.debug('Created temporary path at %s.', path) 160 | yield path 161 | finally: 162 | if osp.exists(path): 163 | if osp.isdir(path): 164 | rmtree(path) 165 | _logger.debug('Deleted temporary directory at %s.', path) 166 | else: 167 | os.remove(path) 168 | _logger.debug('Deleted temporary file at %s.', path) 169 | else: 170 | _logger.debug('No temporary file or directory to delete at %s.', path) 171 | -------------------------------------------------------------------------------- /scripts/hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Hadoop utilities to setup a standalone HDFS cluster for integration tests. 4 | # 5 | # The following commands will download Hadoop locally and start a single node 6 | # HDFS cluster: 7 | # 8 | # ```bash 9 | # $ export HADOOP_HOME="$(./scripts/hadoop.sh download)" 10 | # $ export HADOOP_CONF_DIR="$(./scripts/hadoop.sh config)" 11 | # $ ./scripts/hadoop.sh start 12 | # ``` 13 | # 14 | # Later, to stop it: 15 | # 16 | # ```bash 17 | # $ ./scripts/hadoop.sh stop 18 | # ``` 19 | # 20 | 21 | set -o nounset 22 | set -o errexit 23 | 24 | # Print usage and exit. 25 | # 26 | # Refer to individual functions below for more information. 27 | # 28 | usage() { 29 | echo "usage: $0 (config|download|start|stop)" >&2 30 | exit 1 31 | } 32 | 33 | # Download Hadoop binary. 34 | # 35 | # TODO: Test against several versions? (But they are very big...) 36 | # 37 | hadoop-download() { 38 | # Verification as per https://web.archive.org/web/20211018165755/https://hadoop.apache.org/releases.html#to-verify-hadoop-releases-using-gpg 39 | local hadoop=hadoop-2.9.2 40 | cd "$(mktemp -d 2>/dev/null || mktemp -d -t hadoop)" 41 | curl -O "https://archive.apache.org/dist/hadoop/common/$hadoop/$hadoop.tar.gz" 42 | curl -O "https://archive.apache.org/dist/hadoop/common/$hadoop/$hadoop.tar.gz.asc" 43 | curl -O https://downloads.apache.org/hadoop/common/KEYS 44 | gpg -q --import KEYS 45 | gpg --verify "$hadoop.tar.gz.asc" "$hadoop.tar.gz" 46 | tar -xzf "$hadoop.tar.gz" 47 | echo "$(pwd)/$hadoop" 48 | } 49 | 50 | # Generate configuration and print corresponding path. 51 | # 52 | # The returned path is suitable to be used as environment variable 53 | # `$HADOOP_CONF_DIR`. Note that this is necessary because proxy users are 54 | # defined as property keys, so it's not possible to allow the current user 55 | # otherwise. 56 | # 57 | hadoop-config() { 58 | local tpl_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../etc/hadoop" 59 | local conf_dir="$(mktemp -d 2>/dev/null || mktemp -d -t 'hadoop-conf')" 60 | for i in "$tpl_dir"/*; do 61 | sed -e "s/#USER#/$(whoami)/" "$i" >"${conf_dir}/$(basename "$i")" 62 | done 63 | echo "$conf_dir" 64 | } 65 | 66 | # Start HDFS cluster (single namenode and datanode) and HttpFS server. 67 | # 68 | # This requires `$HADOOP_HOME` and `$HADOOP_CONF_DIR` to be set. 69 | # 70 | hadoop-start() { 71 | "${HADOOP_HOME}/bin/hdfs" namenode -format -nonInteractive || : 72 | "${HADOOP_HOME}/sbin/hadoop-daemon.sh" --config "$HADOOP_CONF_DIR" --script hdfs start namenode 73 | "${HADOOP_HOME}/sbin/hadoop-daemon.sh" --config "$HADOOP_CONF_DIR" --script hdfs start datanode 74 | HTTPFS_CONFIG="$HADOOP_CONF_DIR" "${HADOOP_HOME}/sbin/httpfs.sh" start 75 | } 76 | 77 | # Stop HDFS cluster and HttpFS server. 78 | # 79 | # This requires `$HADOOP_HOME` to be set. 
80 | # 81 | hadoop-stop() { 82 | "${HADOOP_HOME}/sbin/httpfs.sh" stop 83 | "${HADOOP_HOME}/sbin/hadoop-daemon.sh" --script hdfs stop datanode 84 | "${HADOOP_HOME}/sbin/hadoop-daemon.sh" --script hdfs stop namenode 85 | } 86 | 87 | if [[ $# -ne 1 ]]; then 88 | usage 89 | fi 90 | 91 | case "$1" in 92 | download) hadoop-download ;; 93 | config) hadoop-config ;; 94 | start) hadoop-start ;; 95 | stop) hadoop-stop ;; 96 | *) usage ;; 97 | esac 98 | -------------------------------------------------------------------------------- /scripts/version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o nounset 4 | set -o errexit 5 | set -o pipefail 6 | shopt -s nullglob 7 | 8 | __dirname="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 9 | 10 | fail() { # MSG 11 | echo "$1" >&2 && exit 1 12 | } 13 | 14 | version_pattern="__version__ = '([^']+)'" 15 | 16 | main() { 17 | cd "$__dirname/.." 18 | local line="$(grep __version__ hdfs/__init__.py)" 19 | if ! [[ $line =~ $version_pattern ]]; then 20 | fail 'missing version' 21 | fi 22 | echo "${BASH_REMATCH[1]}" 23 | } 24 | 25 | main "$@" 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """HdfsCLI: API and command line interface for HDFS.""" 4 | 5 | from os import environ 6 | from setuptools import find_packages, setup 7 | import re 8 | 9 | 10 | def _get_version(): 11 | """Extract version from package.""" 12 | with open('hdfs/__init__.py') as reader: 13 | match = re.search( 14 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 15 | reader.read(), 16 | re.MULTILINE 17 | ) 18 | if match: 19 | return match.group(1) 20 | else: 21 | raise RuntimeError('Unable to extract version.') 22 | 23 | def _get_long_description(): 24 | """Get README contents.""" 25 | with open('README.md') as reader: 26 | return reader.read() 27 | 28 | # Allow configuration of the CLI alias. 
29 | ENTRY_POINT = environ.get('HDFSCLI_ENTRY_POINT', 'hdfscli') 30 | 31 | setup( 32 | name='hdfs', 33 | version=_get_version(), 34 | description=__doc__, 35 | long_description=_get_long_description(), 36 | long_description_content_type='text/markdown', 37 | author='Matthieu Monsch', 38 | author_email='mtth@apache.org', 39 | url='https://hdfscli.readthedocs.io', 40 | license='MIT', 41 | packages=find_packages(exclude=['test*']), 42 | classifiers=[ 43 | 'Development Status :: 5 - Production/Stable', 44 | 'Intended Audience :: Developers', 45 | 'License :: OSI Approved :: MIT License', 46 | 'Programming Language :: Python', 47 | 'Programming Language :: Python :: 3.7', 48 | 'Programming Language :: Python :: 3.8', 49 | 'Programming Language :: Python :: 3.9', 50 | 'Programming Language :: Python :: 3.10', 51 | 'Programming Language :: Python :: 3.11', 52 | 'Programming Language :: Python :: 3.12', 53 | ], 54 | install_requires=[ 55 | 'docopt', 56 | 'requests>=2.7.0', 57 | 'six>=1.9.0', 58 | ], 59 | extras_require={ 60 | 'avro': ['fastavro>=0.21.19'], 61 | 'kerberos': ['requests-kerberos>=0.7.0'], 62 | 'dataframe': ['fastavro>=0.21.19', 'pandas>=0.14.1'], 63 | }, 64 | entry_points={'console_scripts': [ 65 | '{} = hdfs.__main__:main'.format(ENTRY_POINT), 66 | '{}-avro = hdfs.ext.avro.__main__:main'.format(ENTRY_POINT), 67 | ]}, 68 | ) 69 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtth/hdfs/039a7f4730653a8264c092845b5602ccb692a7ef/test/__init__.py -------------------------------------------------------------------------------- /test/dat/client_template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """A template for generating new clients. 5 | 6 | This is used to test autoloading from `CliConfig` (see `test/test_main.py`). 
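The `$class_name` placeholder is substituted with a concrete class name before
this module is loaded. A rough sketch of the idea (the use of `string.Template`
and the `FooClient` name are assumptions for illustration; the actual mechanism
lives in `test/test_main.py`)::

  from string import Template

  with open('test/dat/client_template.py') as reader:
    source = Template(reader.read()).substitute(class_name='FooClient')
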
7 | 8 | """ 9 | 10 | from hdfs import Client 11 | 12 | 13 | class $class_name(Client): 14 | 15 | one = 1 16 | 17 | def __init__(self, url): 18 | super($class_name, self).__init__(url) 19 | -------------------------------------------------------------------------------- /test/dat/weather.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtth/hdfs/039a7f4730653a8264c092845b5602ccb692a7ef/test/dat/weather.avro -------------------------------------------------------------------------------- /test/dat/weather.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test.Weather", 3 | "type": "record", 4 | "fields": [ 5 | {"name": "station", "type": "string"}, 6 | {"name": "time", "type": "long"}, 7 | {"name": "temp", "type": "int"} 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /test/dat/weather.jsonl: -------------------------------------------------------------------------------- 1 | {"station":"gqxurbcrru","time":-3367677834113346249,"temp":209887781} 2 | {"station":"fdvvmtpedxsifd","time":6645465191399988678,"temp":-2056157190} 3 | {"station":"ci","time":6543782083632958711,"temp":-565739712} 4 | {"station":"xadxqapgjnk","time":-4449449961661895660,"temp":2065274889} 5 | {"station":"usafhhcjcfw","time":-6395806787784552082,"temp":254868980} 6 | -------------------------------------------------------------------------------- /test/test_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test Hdfs client interactions with HDFS.""" 5 | 6 | from collections import defaultdict 7 | from hdfs.client import * 8 | from hdfs.util import HdfsError, temppath 9 | from test.util import _IntegrationTest 10 | from requests.exceptions import ConnectTimeout, ReadTimeout 11 | from shutil import rmtree 12 | from six import b 13 | from tempfile import mkdtemp 14 | import os 15 | import os.path as osp 16 | import posixpath as psp 17 | import pytest 18 | 19 | 20 | class TestLoad(object): 21 | 22 | """Test client loader.""" 23 | 24 | def test_bare(self): 25 | client = Client.from_options({'url': 'foo'}) 26 | assert isinstance(client, Client) 27 | 28 | def test_new_type(self): 29 | class NewClient(Client): 30 | def __init__(self, url, bar): 31 | super(NewClient, self).__init__(url) 32 | self.bar = bar 33 | client = Client.from_options({'url': 'bar', 'bar': 2}, 'NewClient') 34 | assert client.bar == 2 35 | 36 | def test_missing_options(self): 37 | with pytest.raises(HdfsError): 38 | Client.from_options({}, 'KerberosClient') 39 | 40 | def test_invalid_options(self): 41 | with pytest.raises(HdfsError): 42 | Client.from_options({'foo': 123}) 43 | 44 | def test_missing_type(self): 45 | with pytest.raises(HdfsError): 46 | Client.from_options({}, 'MissingClient') 47 | 48 | def test_timeout(self): 49 | assert Client('')._timeout == None 50 | assert Client('', timeout=1)._timeout == 1 51 | assert Client('', timeout=(1,2))._timeout == (1,2) 52 | assert Client.from_options({'url': ''})._timeout == None 53 | 54 | 55 | class TestOptions(_IntegrationTest): 56 | 57 | """Test client options.""" 58 | 59 | @pytest.mark.skip(reason="TODO: Investigate why this fails in Python 3.7 and 3.9") 60 | def test_timeout(self): 61 | with pytest.raises(ConnectTimeout, ReadTimeout): 62 | self.client._timeout = 1e-6 # Small enough for it to always timeout. 
63 | try: 64 | self.client.status('.') 65 | finally: 66 | self.client._timeout = None 67 | 68 | 69 | class TestApi(_IntegrationTest): 70 | 71 | """Test client raw API interactions.""" 72 | 73 | def test_list_status_absolute_root(self): 74 | assert self.client._list_status('/') 75 | 76 | def test_get_folder_status(self): 77 | self.client._mkdirs('foo') 78 | status = self.client._get_file_status('foo').json()['FileStatus'] 79 | assert status['type'] == 'DIRECTORY' 80 | 81 | def test_get_home_directory(self): 82 | path = self.client._get_home_directory('/').json()['Path'] 83 | assert '/user/' in path 84 | 85 | def test_delete_file(self): 86 | path = 'bar' 87 | self._write(path, b'hello') 88 | assert self.client._delete(path).json()['boolean'] 89 | assert not self._exists(path) 90 | 91 | def test_delete_missing_file(self): 92 | path = 'bar2' 93 | assert not self.client._delete(path).json()['boolean'] 94 | 95 | def test_rename_file(self): 96 | paths = ['foo', '{}/bar'.format(self.client.root.rstrip('/'))] 97 | self._write(paths[0], b'hello') 98 | assert self.client._rename(paths[0], destination=paths[1]).json()['boolean'] 99 | assert not self._exists(paths[0]) 100 | assert self.client._open(paths[1].rsplit('/', 1)[1]).content == b'hello' 101 | self.client._delete(paths[1]) 102 | 103 | def test_rename_file_to_existing(self): 104 | p = ['foo', '{}/bar'.format(self.client.root.rstrip('/'))] 105 | self._write(p[0], b'hello') 106 | self._write(p[1], b'hi') 107 | try: 108 | assert not self.client._rename(p[0], destination=p[1]).json()['boolean'] 109 | finally: 110 | self.client._delete(p[0]) 111 | self.client._delete(p[1]) 112 | 113 | def test_open_file(self): 114 | self._write('foo', b'hello') 115 | assert self.client._open('foo').content == b'hello' 116 | 117 | def test_get_file_checksum(self): 118 | self._write('foo', b'hello') 119 | data = self.client._get_file_checksum('foo').json()['FileChecksum'] 120 | assert sorted(data) == ['algorithm', 'bytes', 'length'] 121 | assert int(data['length']) 122 | 123 | def test_get_file_checksum_on_folder(self): 124 | with pytest.raises(HdfsError): 125 | self.client._get_file_checksum('') 126 | 127 | 128 | class TestResolve(_IntegrationTest): 129 | 130 | def test_resolve_relative(self): 131 | assert Client('url', root='/').resolve('bar') == '/bar' 132 | assert Client('url', root='/foo').resolve('bar') == '/foo/bar' 133 | assert Client('url', root='/foo/').resolve('bar') == '/foo/bar' 134 | assert Client('url', root='/foo/').resolve('bar/') == '/foo/bar' 135 | assert Client('url', root='/foo/').resolve('/bar/') == '/bar' 136 | 137 | def test_resolve_relative_no_root(self): 138 | root = self.client.root 139 | try: 140 | self.client.root = None 141 | home = self.client._get_home_directory('/').json()['Path'] 142 | assert self.client.resolve('bar') == psp.join(home, 'bar') 143 | assert self.client.root == home 144 | finally: 145 | self.client.root = root 146 | 147 | def test_resolve_relative_root(self): 148 | root = self.client.root 149 | try: 150 | self.client.root = 'bar' 151 | home = self.client._get_home_directory('/').json()['Path'] 152 | assert self.client.resolve('foo') == psp.join(home, 'bar', 'foo') 153 | assert self.client.root == psp.join(home, 'bar') 154 | finally: 155 | self.client.root = root 156 | 157 | def test_resolve_absolute(self): 158 | assert Client('url').resolve('/bar') == '/bar' 159 | assert Client('url').resolve('/bar/foo/') == '/bar/foo' 160 | 161 | def test_create_file_with_percent(self): 162 | # `%` (`0x25`) is a special case because it 
seems to cause errors (even 163 | # though the action still goes through). Typical error message will be 164 | # `"Unknown exception in doAs"`. 165 | path = 'fo&o/a%a' 166 | try: 167 | self._write(path, b'hello') 168 | except HdfsError: 169 | pass 170 | assert self._read(path) == b'hello' 171 | 172 | 173 | class TestWrite(_IntegrationTest): 174 | 175 | def test_create_from_string(self): 176 | self.client.write('up', b'hello, world!') 177 | assert self._read('up') == b'hello, world!' 178 | 179 | def test_create_from_string_with_encoding(self): 180 | self.client.write('up', u'hello, world!', encoding='utf-8') 181 | assert self._read('up') == b'hello, world!' 182 | 183 | def test_create_from_generator(self): 184 | data = (e for e in [b'hello, ', b'world!']) 185 | self.client.write('up', data) 186 | assert self._read('up') == b'hello, world!' 187 | 188 | def test_create_from_generator_with_encoding(self): 189 | data = (e for e in [u'hello, ', u'world!']) 190 | self.client.write('up', data, encoding='utf-8') 191 | assert self._read('up') == b'hello, world!' 192 | 193 | def test_create_from_file_object(self): 194 | with temppath() as tpath: 195 | with open(tpath, 'w') as writer: 196 | writer.write('hello, world!') 197 | with open(tpath) as reader: 198 | self.client.write('up', reader) 199 | assert self._read('up') == b'hello, world!' 200 | 201 | def test_create_set_permission(self): 202 | self.client.write('up', b'hello, world!', permission='722') 203 | assert self._read('up') == b'hello, world!' 204 | assert self.client.status('up')['permission'] == '722' 205 | 206 | def test_create_to_existing_file_without_overwrite(self): 207 | with pytest.raises(HdfsError): 208 | self.client.write('up', b'hello, world!') 209 | self.client.write('up', b'hello again, world!') 210 | 211 | def test_create_and_overwrite_file(self): 212 | self.client.write('up', b'hello, world!') 213 | self.client.write('up', b'hello again, world!', overwrite=True) 214 | assert self._read('up') == b'hello again, world!' 215 | 216 | def test_as_context_manager(self): 217 | with self.client.write('up') as writer: 218 | writer.write(b'hello, ') 219 | writer.write(b'world!') 220 | assert self._read('up') == b'hello, world!' 221 | 222 | def test_as_context_manager_with_encoding(self): 223 | with self.client.write('up', encoding='utf-8') as writer: 224 | writer.write(u'hello, ') 225 | writer.write(u'world!') 226 | assert self._read('up') == b'hello, world!' 227 | 228 | def test_dump_json(self): 229 | from json import dump, loads 230 | data = {'one': 1, 'two': 2} 231 | with self.client.write('up', encoding='utf-8') as writer: 232 | dump(data, writer) 233 | assert loads(self._read('up', encoding='utf-8')) == data 234 | 235 | def test_create_and_overwrite_directory(self): 236 | with pytest.raises(HdfsError): 237 | # can't overwrite a directory with a file 238 | self.client._mkdirs('up') 239 | self.client.write('up', b'hello, world!') 240 | 241 | def test_create_invalid_path(self): 242 | with pytest.raises(HdfsError): 243 | # conversely, can't overwrite a file with a directory 244 | self.client.write('up', b'hello, world!') 245 | self.client.write('up/up', b'hello again, world!') 246 | 247 | 248 | class TestAppend(_IntegrationTest): 249 | 250 | @classmethod 251 | def setup_class(cls): 252 | super(TestAppend, cls).setup_class() 253 | if cls.client: 254 | try: 255 | cls.client.write('ap', b'') # We can't append to an empty file. 256 | cls.client.write('ap', b'', append=True) # Try a simple append. 
257 | except HdfsError as err: 258 | if 'Append is not supported' in str(err): 259 | cls.client = None 260 | # Skip these tests if HDFS isn't configured to support appends. 261 | else: 262 | raise err 263 | 264 | def test_simple(self): 265 | self.client.write('ap', b'hello,') 266 | self.client.write('ap', b' world!', append=True) 267 | assert self._read('ap') == b'hello, world!' 268 | 269 | def test_missing_file(self): 270 | with pytest.raises(HdfsError): 271 | self.client.write('ap', b'hello!', append=True) 272 | 273 | def test_overwrite_and_append(self): 274 | with pytest.raises(ValueError): 275 | self.client.write('ap', b'hello!', overwrite=True, append=True) 276 | 277 | def test_set_permission_and_append(self): 278 | with pytest.raises(ValueError): 279 | self.client.write('ap', b'hello!', permission='777', append=True) 280 | 281 | 282 | class TestUpload(_IntegrationTest): 283 | 284 | def test_upload_file(self): 285 | with temppath() as tpath: 286 | with open(tpath, 'w') as writer: 287 | writer.write('hello, world!') 288 | self.client.upload('up', tpath) 289 | assert self._read('up') == b'hello, world!' 290 | 291 | def test_upload_missing(self): 292 | with pytest.raises(HdfsError): 293 | with temppath() as tpath: 294 | self.client.upload('up', tpath) 295 | 296 | def test_upload_empty_directory(self): 297 | with pytest.raises(HdfsError): 298 | dpath = mkdtemp() 299 | try: 300 | self.client.upload('up', dpath) 301 | finally: 302 | os.rmdir(dpath) 303 | 304 | def test_upload_directory_to_existing_directory(self): 305 | dpath = mkdtemp() 306 | try: 307 | npath = osp.join(dpath, 'hi') 308 | os.mkdir(npath) 309 | with open(osp.join(npath, 'foo'), 'w') as writer: 310 | writer.write('hello!') 311 | os.mkdir(osp.join(npath, 'bar')) 312 | with open(osp.join(npath, 'bar', 'baz'), 'w') as writer: 313 | writer.write('world!') 314 | self.client._mkdirs('up') 315 | self.client.upload('up', npath) 316 | assert self._read('up/hi/foo') == b'hello!' 317 | assert self._read('up/hi/bar/baz') == b'world!' 318 | finally: 319 | rmtree(dpath) 320 | 321 | def test_upload_directory_to_missing(self): 322 | dpath = mkdtemp() 323 | try: 324 | with open(osp.join(dpath, 'foo'), 'w') as writer: 325 | writer.write('hello!') 326 | os.mkdir(osp.join(dpath, 'bar')) 327 | with open(osp.join(dpath, 'bar', 'baz'), 'w') as writer: 328 | writer.write('world!') 329 | self.client.upload('up', dpath) 330 | assert self._read('up/foo') == b'hello!' 331 | assert self._read('up/bar/baz') == b'world!' 332 | finally: 333 | rmtree(dpath) 334 | 335 | def test_upload_directory_overwrite_existing_file(self): 336 | dpath = mkdtemp() 337 | try: 338 | with open(osp.join(dpath, 'foo'), 'w') as writer: 339 | writer.write('hello!') 340 | os.mkdir(osp.join(dpath, 'bar')) 341 | with open(osp.join(dpath, 'bar', 'baz'), 'w') as writer: 342 | writer.write('world!') 343 | self._write('up', b'hi') 344 | self.client.upload('up', dpath, overwrite=True) 345 | assert self._read('up/foo') == b'hello!' 346 | assert self._read('up/bar/baz') == b'world!' 
347 | finally: 348 | rmtree(dpath) 349 | 350 | def test_upload_overwrite(self): 351 | with temppath() as tpath: 352 | with open(tpath, 'w') as writer: 353 | writer.write('hello') 354 | self.client.upload('up', tpath) 355 | with temppath() as tpath: 356 | with open(tpath, 'w') as writer: 357 | writer.write('there') 358 | self.client.upload('up', tpath, overwrite=True) 359 | assert self._read('up') == b'there' 360 | 361 | def test_upload_overwrite_error(self): 362 | with pytest.raises(HdfsError): 363 | with temppath() as tpath: 364 | with open(tpath, 'w') as writer: 365 | writer.write('here') 366 | self.client.upload('up', tpath) 367 | self.client.upload('up', tpath) 368 | 369 | def test_upload_cleanup(self): 370 | dpath = mkdtemp() 371 | _write = self.client.write 372 | 373 | def write(hdfs_path, *args, **kwargs): 374 | if 'bar' in hdfs_path: 375 | raise RuntimeError() 376 | return _write(hdfs_path, *args, **kwargs) 377 | 378 | try: 379 | self.client.write = write 380 | npath = osp.join(dpath, 'hi') 381 | os.mkdir(npath) 382 | with open(osp.join(npath, 'foo'), 'w') as writer: 383 | writer.write('hello!') 384 | os.mkdir(osp.join(npath, 'bar')) 385 | with open(osp.join(npath, 'bar', 'baz'), 'w') as writer: 386 | writer.write('world!') 387 | try: 388 | self.client.upload('foo', dpath) 389 | except RuntimeError: 390 | assert not self._exists('foo') 391 | else: 392 | assert False # This shouldn't happen. 393 | finally: 394 | rmtree(dpath) 395 | self.client.write = _write 396 | 397 | def test_upload_no_cleanup(self): 398 | dpath = mkdtemp() 399 | _write = self.client.write 400 | 401 | def write(hdfs_path, *args, **kwargs): 402 | if 'bar' in hdfs_path: 403 | raise RuntimeError() 404 | return _write(hdfs_path, *args, **kwargs) 405 | 406 | try: 407 | self.client.write = write 408 | npath = osp.join(dpath, 'hi') 409 | os.mkdir(npath) 410 | with open(osp.join(npath, 'foo'), 'w') as writer: 411 | writer.write('hello!') 412 | os.mkdir(osp.join(npath, 'bar')) 413 | with open(osp.join(npath, 'bar', 'baz'), 'w') as writer: 414 | writer.write('world!') 415 | try: 416 | self.client.upload('foo', dpath, cleanup=False) 417 | except RuntimeError: 418 | # The outer folder still exists. 419 | assert self._exists('foo') 420 | else: 421 | assert False # This shouldn't happen. 422 | finally: 423 | rmtree(dpath) 424 | self.client.write = _write 425 | 426 | def test_upload_with_progress(self): 427 | 428 | def callback(path, nbytes, history=defaultdict(list)): 429 | history[path].append(nbytes) 430 | return history 431 | 432 | dpath = mkdtemp() 433 | try: 434 | path1 = osp.join(dpath, 'foo') 435 | with open(path1, 'w') as writer: 436 | writer.write('hello!') 437 | os.mkdir(osp.join(dpath, 'bar')) 438 | path2 = osp.join(dpath, 'bar', 'baz') 439 | with open(path2, 'w') as writer: 440 | writer.write('the world!') 441 | self.client.upload( 442 | 'up', 443 | dpath, 444 | chunk_size=4, 445 | n_threads=1, # Callback isn't thread-safe. 446 | progress=callback 447 | ) 448 | assert self._read('up/foo') == b'hello!' 449 | assert self._read('up/bar/baz') == b'the world!' 
450 | assert ( 451 | callback('', 0) == 452 | {path1: [4, 6, -1], path2: [4, 8, 10, -1], '': [0]}) 453 | finally: 454 | rmtree(dpath) 455 | 456 | 457 | class TestDelete(_IntegrationTest): 458 | 459 | def test_delete_file(self): 460 | self._write('foo', b'hello, world!') 461 | assert self.client.delete('foo') 462 | assert not self._exists('foo') 463 | 464 | def test_delete_empty_directory(self): 465 | self.client._mkdirs('foo') 466 | assert self.client.delete('foo') 467 | assert not self._exists('foo') 468 | 469 | def test_delete_missing_file(self): 470 | assert not self.client.delete('foo') 471 | 472 | def test_delete_non_empty_directory(self): 473 | self._write('de/foo', b'hello, world!') 474 | assert self.client.delete('de', recursive=True) 475 | assert not self._exists('de') 476 | 477 | def test_delete_non_empty_directory_without_recursive(self): 478 | with pytest.raises(HdfsError): 479 | self._write('de/foo', b'hello, world!') 480 | self.client.delete('de') 481 | 482 | def test_trash_file(self): 483 | self._write('foo', b'hello, world!') 484 | assert self.client.delete('foo', skip_trash=False) 485 | assert self.client.status('foo', strict=False) == None 486 | 487 | def test_trash_missing_file(self): 488 | assert not self.client.delete('foo', skip_trash=False) 489 | 490 | def test_trash_directory_non_recursive(self): 491 | with pytest.raises(HdfsError): 492 | self._write('bar/foo', b'hello, world!') 493 | self.client.delete('bar', skip_trash=False) 494 | 495 | def test_trash_directory(self): 496 | self._write('bar/foo', b'hello, world!') 497 | assert self.client.delete('bar', recursive=True, skip_trash=False) 498 | assert self.client.status('bar', strict=False) == None 499 | 500 | 501 | class TestRead(_IntegrationTest): 502 | 503 | def test_progress_without_chunk_size(self): 504 | with pytest.raises(ValueError): 505 | self._write('foo', b'hello, world!') 506 | with self.client.read('foo', progress=lambda path, nbytes: None) as reader: 507 | pass 508 | 509 | def test_delimiter_without_encoding(self): 510 | with pytest.raises(ValueError): 511 | self._write('foo', b'hello, world!') 512 | with self.client.read('foo', delimiter=',') as reader: 513 | pass 514 | 515 | def test_delimiter_with_chunk_size(self): 516 | with pytest.raises(ValueError): 517 | self._write('foo', b'hello, world!') 518 | with self.client.read('foo', delimiter=',', chunk_size=1) as reader: 519 | pass 520 | 521 | def test_read_file(self): 522 | self._write('foo', b'hello, world!') 523 | with self.client.read('foo') as reader: 524 | assert reader.read() == b'hello, world!' 525 | 526 | def test_read_directory(self): 527 | with pytest.raises(HdfsError): 528 | self.client._mkdirs('foo') 529 | with self.client.read('foo') as reader: 530 | pass 531 | 532 | def test_read_missing_file(self): 533 | with pytest.raises(HdfsError): 534 | with self.client.read('foo') as reader: 535 | pass 536 | 537 | def test_read_file_from_offset(self): 538 | self._write('foo', b'hello, world!') 539 | with self.client.read('foo', offset=7) as reader: 540 | assert reader.read() == b'world!' 
541 | 542 | def test_read_file_from_offset_with_limit(self): 543 | self._write('foo', b'hello, world!') 544 | with self.client.read('foo', offset=7, length=5) as reader: 545 | assert reader.read() == b'world' 546 | 547 | def test_read_file_with_chunk_size(self): 548 | self._write('foo', b'hello, world!') 549 | with self.client.read('foo', chunk_size=5) as reader: 550 | assert list(reader) == [b'hello', b', wor', b'ld!'] 551 | 552 | def test_with_progress(self): 553 | def cb(path, nbytes, chunk_lengths=[]): 554 | chunk_lengths.append(nbytes) 555 | return chunk_lengths 556 | self._write('foo', b'hello, world!') 557 | with temppath() as tpath: 558 | with open(tpath, 'wb') as writer: 559 | with self.client.read('foo', chunk_size=5, progress=cb) as reader: 560 | for chunk in reader: 561 | writer.write(chunk) 562 | with open(tpath, 'rb') as reader: 563 | assert reader.read() == b'hello, world!' 564 | assert cb('', 0) == [5, 10, 13, -1, 0] 565 | 566 | def test_read_with_encoding(self): 567 | s = u'hello, world!' 568 | self._write('foo', s, encoding='utf-8') 569 | with self.client.read('foo', encoding='utf-8') as reader: 570 | assert reader.read() == s 571 | 572 | def test_read_with_chunk_size_and_encoding(self): 573 | s = u'hello, world!' 574 | self._write('foo', s, encoding='utf-8') 575 | with self.client.read('foo', chunk_size=5, encoding='utf-8') as reader: 576 | assert list(reader) == [u'hello', u', wor', u'ld!'] 577 | 578 | def test_read_json(self): 579 | from json import dumps, load 580 | data = {'one': 1, 'two': 2} 581 | self._write('foo', data=dumps(data), encoding='utf-8') 582 | with self.client.read('foo', encoding='utf-8') as reader: 583 | assert load(reader) == data 584 | 585 | def test_read_with_delimiter(self): 586 | self._write('foo', u'hi\nworld!\n', encoding='utf-8') 587 | with self.client.read('foo', delimiter='\n', encoding='utf-8') as reader: 588 | assert list(reader) == [u'hi', u'world!', u''] 589 | 590 | 591 | class TestRename(_IntegrationTest): 592 | 593 | def test_rename_file(self): 594 | self._write('foo', b'hello, world!') 595 | self.client.rename('foo', 'bar') 596 | assert self._read('bar') == b'hello, world!' 597 | 598 | def test_rename_missing_file(self): 599 | with pytest.raises(HdfsError): 600 | self.client.rename('foo', 'bar') 601 | 602 | def test_rename_file_to_existing_file(self): 603 | with pytest.raises(HdfsError): 604 | self._write('foo', b'hello, world!') 605 | self._write('bar', b'hello again, world!') 606 | self.client.rename('foo', 'bar') 607 | 608 | def test_move_file_into_existing_directory(self): 609 | self._write('foo', b'hello, world!') 610 | self.client._mkdirs('bar') 611 | self.client.rename('foo', 'bar') 612 | assert self._read('bar/foo') == b'hello, world!' 613 | 614 | def test_rename_file_into_existing_directory(self): 615 | self._write('foo', b'hello, world!') 616 | self.client._mkdirs('bar') 617 | self.client.rename('foo', 'bar/baz') 618 | assert self._read('bar/baz') == b'hello, world!' 619 | 620 | def test_rename_file_with_special_characters(self): 621 | path = 'fo&oa ?a=1' 622 | self._write('foo', b'hello, world!') 623 | self.client.rename('foo', path) 624 | assert self._read(path) == b'hello, world!' 
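# Taken together, the TestRename cases above document the rename() semantics
# relied on elsewhere: renaming onto an existing file raises HdfsError, while
# renaming onto an existing directory moves the source inside it (hence the
# 'bar/foo' read-back in test_move_file_into_existing_directory). A sketch of
# that second behaviour (client and paths are illustrative):
#
#   client.write('logs/2024.txt', b'...')
#   client.makedirs('archive')
#   client.rename('logs/2024.txt', 'archive')  # Lands at 'archive/2024.txt'.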
625 | 626 | 627 | class TestDownload(_IntegrationTest): 628 | 629 | def test_missing_dir(self): 630 | with pytest.raises(HdfsError): 631 | self._write('dl', b'hello') 632 | with temppath() as tpath: 633 | self.client.download('dl', osp.join(tpath, 'foo')) 634 | 635 | def test_normal_file(self): 636 | self._write('dl', b'hello') 637 | with temppath() as tpath: 638 | fpath = self.client.download('dl', tpath) 639 | with open(fpath) as reader: 640 | assert reader.read() == 'hello' 641 | 642 | def test_nonpartitioned_file(self): 643 | partname = 'part-r-00000' 644 | self._write('dl/' + partname, b'world') 645 | with temppath() as tpath: 646 | fname = self.client.download('dl/' + partname, tpath) 647 | with open(fname) as reader: 648 | assert reader.read() == 'world' 649 | 650 | def test_singly_partitioned_file(self): 651 | partname = 'part-r-00000' 652 | self._write('dl/' + partname, b'world') 653 | with temppath() as tpath: 654 | os.mkdir(tpath) 655 | fname = self.client.download('dl', tpath) 656 | with open(osp.join(fname, partname)) as reader: 657 | assert reader.read() == 'world' 658 | 659 | def _download_partitioned_file(self, n_threads): 660 | parts = { 661 | 'part-r-00000': b'fee', 662 | 'part-r-00001': b'faa', 663 | 'part-r-00002': b'foo', 664 | } 665 | for name, content in parts.items(): 666 | self._write('dl/{}'.format(name), content) 667 | with temppath() as tpath: 668 | self.client.download('dl', tpath, n_threads=-1) 669 | local_parts = os.listdir(tpath) 670 | assert set(local_parts) == set(parts) # We have all the parts. 671 | for part in local_parts: 672 | with open(osp.join(tpath, part), mode='rb') as reader: 673 | assert reader.read() == parts[part] # Their content is correct. 674 | 675 | def test_partitioned_file_max_threads(self): 676 | self._download_partitioned_file(0) 677 | 678 | def test_partitioned_file_sync(self): 679 | self._download_partitioned_file(1) 680 | 681 | def test_partitioned_file_setting_n_threads(self): 682 | self._download_partitioned_file(2) 683 | 684 | def test_overwrite_file(self): 685 | with temppath() as tpath: 686 | self._write('dl', b'hello') 687 | self.client.download('dl', tpath) 688 | self.client.write('dl', b'there', overwrite=True) 689 | fname = self.client.download('dl', tpath, overwrite=True) 690 | with open(fname) as reader: 691 | assert reader.read() == 'there' 692 | 693 | def test_download_file_to_existing_file(self): 694 | with pytest.raises(HdfsError): 695 | self._write('dl', b'hello') 696 | with temppath() as tpath: 697 | with open(tpath, 'w') as writer: 698 | writer.write('hi') 699 | self.client.download('dl', tpath) 700 | 701 | def test_download_file_to_existing_file_with_overwrite(self): 702 | self._write('dl', b'hello') 703 | with temppath() as tpath: 704 | with open(tpath, 'w') as writer: 705 | writer.write('hi') 706 | self.client.download('dl', tpath, overwrite=True) 707 | with open(tpath) as reader: 708 | assert reader.read() == 'hello' 709 | 710 | def test_download_file_to_existing_folder(self): 711 | self._write('dl', b'hello') 712 | with temppath() as tpath: 713 | os.mkdir(tpath) 714 | self.client.download('dl', tpath) 715 | with open(osp.join(tpath, 'dl')) as reader: 716 | assert reader.read() == 'hello' 717 | 718 | def test_download_file_to_existing_folder_with_matching_file(self): 719 | with pytest.raises(HdfsError): 720 | self._write('dl', b'hello') 721 | with temppath() as tpath: 722 | os.mkdir(tpath) 723 | with open(osp.join(tpath, 'dl'), 'w') as writer: 724 | writer.write('hey') 725 | self.client.download('dl', tpath) 
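# As with uploads, download() refuses to clobber local data: both the
# existing-file case and the matching-file-in-folder case above expect
# HdfsError, and the overwrite=True counterparts (one above, one immediately
# below) show the flag that opts into replacement. Sketch (the local path is
# illustrative):
#
#   self.client.download('dl', '/tmp/dl')                  # Fails if it exists.
#   self.client.download('dl', '/tmp/dl', overwrite=True)  # Replaces it.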
726 | 727 | def test_download_file_to_existing_folder_overwrite_matching_file(self): 728 | self._write('dl', b'hello') 729 | with temppath() as tpath: 730 | os.mkdir(tpath) 731 | with open(osp.join(tpath, 'dl'), 'w') as writer: 732 | writer.write('hey') 733 | self.client.download('dl', tpath, overwrite=True) 734 | with open(osp.join(tpath, 'dl')) as reader: 735 | assert reader.read() == 'hello' 736 | 737 | def test_download_folder_to_existing_folder(self): 738 | self._write('foo/dl', b'hello') 739 | self._write('foo/bar/dl', b'there') 740 | with temppath() as tpath: 741 | os.mkdir(tpath) 742 | self.client.download('foo', tpath) 743 | with open(osp.join(tpath, 'foo', 'dl')) as reader: 744 | assert reader.read() == 'hello' 745 | with open(osp.join(tpath, 'foo', 'bar', 'dl')) as reader: 746 | assert reader.read() == 'there' 747 | 748 | def test_download_folder_to_existing_folder_parallel(self): 749 | self._write('foo/dl', b'hello') 750 | self._write('foo/bar/dl', b'there') 751 | with temppath() as tpath: 752 | os.mkdir(tpath) 753 | self.client.download('foo', tpath, n_threads=0) 754 | with open(osp.join(tpath, 'foo', 'dl')) as reader: 755 | assert reader.read() == 'hello' 756 | with open(osp.join(tpath, 'foo', 'bar', 'dl')) as reader: 757 | assert reader.read() == 'there' 758 | 759 | def test_download_folder_to_missing_folder(self): 760 | self._write('foo/dl', b'hello') 761 | self._write('foo/bar/dl', b'there') 762 | with temppath() as tpath: 763 | self.client.download('foo', tpath) 764 | with open(osp.join(tpath, 'dl')) as reader: 765 | assert reader.read() == 'hello' 766 | with open(osp.join(tpath, 'bar', 'dl')) as reader: 767 | assert reader.read() == 'there' 768 | 769 | def test_download_cleanup(self): 770 | self._write('foo/dl', b'hello') 771 | self._write('foo/bar/dl', b'there') 772 | _read = self.client.read 773 | 774 | def read(hdfs_path, *args, **kwargs): 775 | if 'bar' in hdfs_path: 776 | raise RuntimeError() 777 | return _read(hdfs_path, *args, **kwargs) 778 | 779 | with temppath() as tpath: 780 | try: 781 | self.client.read = read 782 | self.client.download('foo', tpath) 783 | except RuntimeError: 784 | assert not osp.exists(tpath) 785 | else: 786 | assert False # This shouldn't happen. 
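# The except branch above is the interesting part: when a transfer fails
# partway through, download() removes the partially written local directory,
# so the temporary target must no longer exist. This mirrors
# test_upload_cleanup earlier in the file; unlike the upload tests, no
# cleanup=False variant is exercised for downloads here.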
787 | finally: 788 | self.client.read = _read 789 | 790 | def test_download_empty_folder(self): 791 | with pytest.raises(HdfsError): 792 | self.client._mkdirs('foo') 793 | with temppath() as tpath: 794 | self.client.download('foo', tpath) 795 | 796 | def test_download_dir_whitespace(self): 797 | self._write('foo/foo bar.txt', b'hello') 798 | with temppath() as tpath: 799 | self.client.download('foo', tpath) 800 | with open(osp.join(tpath, 'foo bar.txt')) as reader: 801 | assert reader.read() == 'hello' 802 | 803 | def test_download_file_whitespace(self): 804 | self._write('foo/foo bar%.txt', b'hello') 805 | with temppath() as tpath: 806 | self.client.download('foo/foo bar%.txt', tpath) 807 | with open(tpath) as reader: 808 | assert reader.read() == 'hello' 809 | 810 | 811 | class TestStatus(_IntegrationTest): 812 | 813 | def test_directory(self): 814 | self.client._mkdirs('foo') 815 | status = self.client.status('foo') 816 | assert status['type'] == 'DIRECTORY' 817 | assert status['length'] == 0 818 | 819 | def test_file(self): 820 | self._write('foo', b'hello, world!') 821 | status = self.client.status('foo') 822 | assert status['type'] == 'FILE' 823 | assert status['length'] == 13 824 | 825 | def test_missing(self): 826 | with pytest.raises(HdfsError): 827 | self.client.status('foo') 828 | 829 | def test_missing_non_strict(self): 830 | assert self.client.status('foo', strict=False) is None 831 | 832 | 833 | class TestSetOwner(_IntegrationTest): 834 | 835 | @classmethod 836 | def setup_class(cls): 837 | super(TestSetOwner, cls).setup_class() 838 | if cls.client: 839 | try: 840 | cls.client.write('foo', b'') 841 | cls.client.set_owner('foo', 'bar') 842 | except HdfsError as err: 843 | if 'Non-super user cannot change owner' in str(err): 844 | cls.client = None 845 | # Skip these tests if HDFS isn't configured to support them. 
846 | else: 847 | raise err 848 | 849 | def test_directory_owner(self): 850 | new_owner = 'newowner' 851 | self.client._mkdirs('foo') 852 | self.client.set_owner('foo', 'oldowner') 853 | self.client.set_owner('foo', new_owner) 854 | status = self.client.status('foo') 855 | assert status['owner'] == new_owner 856 | 857 | def test_file_owner(self): 858 | new_owner = 'newowner' 859 | self._write('foo', b'hello, world!') 860 | self.client.set_owner('foo', 'oldowner') 861 | self.client.set_owner('foo', new_owner) 862 | status = self.client.status('foo') 863 | assert status['owner'] == new_owner 864 | 865 | def test_directory_for_group(self): 866 | new_group = 'newgroup' 867 | self.client._mkdirs('foo') 868 | self.client.set_owner('foo', group='oldgroup') 869 | self.client.set_owner('foo', group=new_group) 870 | status = self.client.status('foo') 871 | assert status['group'] == new_group 872 | 873 | def test_file_for_group(self): 874 | new_group = 'newgroup' 875 | self._write('foo', b'hello, world!') 876 | self.client.set_owner('foo', group='oldgroup') 877 | self.client.set_owner('foo', group=new_group) 878 | status = self.client.status('foo') 879 | assert status['group'] == new_group 880 | 881 | def test_missing_for_group(self): 882 | with pytest.raises(HdfsError): 883 | self.client.set_owner('foo', group='blah') 884 | 885 | 886 | class TestSetPermission(_IntegrationTest): 887 | 888 | def test_directory(self): 889 | new_permission = '755' 890 | self.client._mkdirs('foo', permission='444') 891 | self.client.set_permission('foo', new_permission) 892 | status = self.client.status('foo') 893 | assert status['permission'] == new_permission 894 | 895 | def test_file(self): 896 | new_permission = '755' 897 | self.client.write('foo', b'hello, world!', permission='444') 898 | self.client.set_permission('foo', new_permission) 899 | status = self.client.status('foo') 900 | assert status['permission'] == new_permission 901 | 902 | def test_missing(self): 903 | with pytest.raises(HdfsError): 904 | self.client.set_permission('foo', '755') 905 | 906 | 907 | class TestContent(_IntegrationTest): 908 | 909 | def test_directory(self): 910 | self._write('foo', b'hello, world!') 911 | content = self.client.content('') 912 | assert content['directoryCount'] == 1 913 | assert content['fileCount'] == 1 914 | assert content['length'] == 13 915 | 916 | def test_file(self): 917 | self._write('foo', b'hello, world!') 918 | content = self.client.content('foo') 919 | assert content['directoryCount'] == 0 920 | assert content['fileCount'] == 1 921 | assert content['length'] == 13 922 | 923 | def test_missing(self): 924 | with pytest.raises(HdfsError): 925 | self.client.content('foo') 926 | 927 | def test_missing_non_strict(self): 928 | assert self.client.content('foo', strict=False) is None 929 | 930 | 931 | class TestAcl(_IntegrationTest): 932 | 933 | def test_directory(self): 934 | self._write('foo', b'hello, world!') 935 | content = self.client.acl_status('') 936 | assert len(content) > 1 937 | assert 'entries' in content 938 | assert 'group' in content 939 | assert 'owner' in content 940 | 941 | def test_set_acl(self): 942 | self.client.write('foo', 'hello, world!') 943 | self.client.set_acl('foo', 'user::rwx,user:foouser:rwx,group::r--,other::---') 944 | content = self.client.acl_status('foo') 945 | assert any('user:foouser:rwx' in s for s in content['entries']) 946 | assert len(content) > 1 947 | assert content['entries'] is not None 948 | 949 | def test_modify_acl(self): 950 | self.client.write('foo', 'hello, world!') 
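# set_acl() takes a comma-separated ACL spec such as
# 'user::rwx,user:name:rwx,group::r--,other::---'. Passing clear=False, as in
# the second call below, merges the given entries into the existing ACL rather
# than replacing it wholesale, which is what lets that call downgrade only the
# foouser entry to rw-.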
951 | self.client.set_acl('foo', 'user::rwx,user:foouser:rwx,group::r--,other::---') 952 | self.client.set_acl('foo', 'user:foouser:rw-', clear=False) 953 | content = self.client.acl_status('foo') 954 | assert any('user:foouser:rw-' in s for s in content['entries']) 955 | 956 | def test_missing(self): 957 | with pytest.raises(HdfsError): 958 | self.client.acl_status('foo') 959 | 960 | def test_missing_non_strict(self): 961 | assert self.client.acl_status('foo', strict=False) is None 962 | 963 | def test_remove_acl_entries(self): 964 | self.client.write('foo', 'hello, world!') 965 | self.client.set_acl('foo', 'user:baruser:rwx,user:foouser:rw-', clear=False) 966 | self.client.remove_acl_entries('foo', 'user:foouser:') 967 | content = self.client.acl_status('foo') 968 | assert not any('user:foouser:rw-' in s for s in content['entries']) 969 | assert any('user:baruser:rwx' in s for s in content['entries']) 970 | 971 | def test_remove_default_acl(self): 972 | self.client.write('foo', 'hello, world!') 973 | self.client.set_acl('foo', 'user:foouser:rwx', clear=False) 974 | self.client.remove_default_acl('foo') 975 | content = self.client.acl_status('foo') 976 | assert not any('user::rwx' in s for s in content['entries']) 977 | 978 | def test_remove_acl(self): 979 | self.client.write('foo', 'hello, world!') 980 | self.client.remove_acl('foo') 981 | content = self.client.acl_status('foo') 982 | assert content.get('entries') == [] 983 | 984 | 985 | class TestList(_IntegrationTest): 986 | 987 | def test_file(self): 988 | with pytest.raises(HdfsError): 989 | self.client.write('foo', 'hello, world!') 990 | self.client.list('foo') 991 | 992 | def test_missing(self): 993 | with pytest.raises(HdfsError): 994 | self.client.list('foo') 995 | 996 | def test_empty_dir(self): 997 | self.client._mkdirs('foo') 998 | assert self.client.list('foo') == [] 999 | 1000 | def test_dir(self): 1001 | self.client.write('foo/bar', 'hello, world!') 1002 | assert self.client.list('foo') == ['bar'] 1003 | 1004 | def test_dir_with_status(self): 1005 | self.client.write('foo/bar', 'hello, world!') 1006 | statuses = self.client.list('foo', status=True) 1007 | assert len(statuses) == 1 1008 | status = self.client.status('foo/bar') 1009 | status['pathSuffix'] = 'bar' 1010 | assert statuses[0] == ('bar', status) 1011 | 1012 | 1013 | class TestWalk(_IntegrationTest): 1014 | 1015 | def test_missing(self): 1016 | with pytest.raises(HdfsError): 1017 | list(self.client.walk('foo')) 1018 | 1019 | def test_file(self): 1020 | self.client.write('foo', 'hello, world!') 1021 | assert not list(self.client.walk('foo')) 1022 | 1023 | def test_folder(self): 1024 | self.client.write('hello', 'hello, world!') 1025 | self.client.write('foo/hey', 'hey, world!') 1026 | infos = list(self.client.walk('')) 1027 | assert len(infos) == 2 1028 | assert infos[0] == (psp.join(self.client.root), ['foo'], ['hello']) 1029 | assert infos[1] == (psp.join(self.client.root, 'foo'), [], ['hey']) 1030 | 1031 | def test_folder_with_depth(self): 1032 | self.client.write('foo/bar', 'hello, world!') 1033 | infos = list(self.client.walk('', depth=1)) 1034 | assert len(infos) == 1 1035 | assert infos[0] == (self.client.root, ['foo'], []) 1036 | 1037 | def test_folder_with_status(self): 1038 | self.client.write('foo', 'hello, world!') 1039 | infos = list(self.client.walk('', status=True)) 1040 | status = self.client.status('foo') 1041 | status['pathSuffix'] = 'foo' 1042 | assert len(infos) == 1 1043 | assert ( 1044 | infos[0] == 1045 | ( 1046 | (self.client.root, 
self.client.status('')), 1047 | [], 1048 | [('foo', status)] 1049 | )) 1050 | 1051 | def test_skip_missing_folder(self): 1052 | self.client.write('file', 'one') 1053 | self.client.write('folder/hey', 'two') 1054 | for info in self.client.walk('', ignore_missing=True): 1055 | assert info == (psp.join(self.client.root), ['folder'], ['file']) 1056 | self.client.delete('folder', recursive=True) 1057 | 1058 | def test_status_and_allow_dir_changes(self): 1059 | with pytest.raises(ValueError): 1060 | list(self.client.walk('.', status=True, allow_dir_changes=True)) 1061 | 1062 | def test_allow_dir_changes_subset(self): 1063 | self.client.write('foo/file1', 'one') 1064 | self.client.write('bar/file2', 'two') 1065 | infos = self.client.walk('.', allow_dir_changes=True) 1066 | info = next(infos) 1067 | info[1][:] = ['bar'] 1068 | info = next(infos) 1069 | assert info == (psp.join(self.client.root, 'bar'), [], ['file2']) 1070 | 1071 | def test_allow_dir_changes_insert(self): 1072 | self.client.write('foo/file1', 'one') 1073 | infos = self.client.walk('.', allow_dir_changes=True) 1074 | info = next(infos) 1075 | self.client.write('bar/file2', 'two') 1076 | info[1][:] = ['bar'] # Insert new directory. 1077 | info = next(infos) 1078 | assert info == (psp.join(self.client.root, 'bar'), [], ['file2']) 1079 | 1080 | 1081 | class TestLatestExpansion(_IntegrationTest): 1082 | 1083 | def test_resolve_simple(self): 1084 | self.client.write('bar', 'hello, world!') 1085 | self.client.write('foo', 'hello again, world!') 1086 | assert self.client.resolve('#LATEST') == osp.join(self.client.root, 'foo') 1087 | 1088 | def test_resolve_nested(self): 1089 | self.client.write('baz/bar', 'hello, world!') 1090 | self.client.write('bar/bar', 'hello there, world!') 1091 | self.client.write('bar/foo', 'hello again, world!') 1092 | latest = self.client.resolve('#LATEST/#LATEST') 1093 | assert latest == osp.join(self.client.root, 'bar', 'foo') 1094 | 1095 | def test_resolve_multiple(self): 1096 | self.client.write('bar/bar', 'hello, world!') 1097 | self.client.write('bar/foo', 'hello again, world!') 1098 | latest = self.client.resolve('#LATEST/#LATEST') 1099 | assert latest == osp.join(self.client.root, 'bar', 'foo') 1100 | 1101 | def test_resolve_multiple_shortcut(self): 1102 | self.client.write('bar/bar', 'hello, world!') 1103 | self.client.write('bar/foo', 'hello again, world!') 1104 | latest = self.client.resolve('#LATEST{2}') 1105 | assert latest == osp.join(self.client.root, 'bar', 'foo') 1106 | 1107 | @pytest.mark.skip(reason="HttpFS is inconsistent here.") 1108 | def test_resolve_file(self): 1109 | with pytest.raises(HdfsError): 1110 | self.client.write('bar', 'hello, world!') 1111 | self.client.resolve('bar/#LATEST') 1112 | 1113 | def test_resolve_empty_directory(self): 1114 | with pytest.raises(HdfsError): 1115 | self.client._mkdirs('bar') 1116 | self.client.resolve('bar/#LATEST') 1117 | 1118 | 1119 | class TestParts(_IntegrationTest): 1120 | 1121 | def test_missing(self): 1122 | with pytest.raises(HdfsError): 1123 | self.client.parts('foo') 1124 | 1125 | def test_file(self): 1126 | with pytest.raises(HdfsError): 1127 | self.client.write('foo', 'hello') 1128 | self.client.parts('foo') 1129 | 1130 | def test_empty_folder(self): 1131 | with pytest.raises(HdfsError): 1132 | self.client._mkdirs('foo') 1133 | self.client.parts('foo') 1134 | 1135 | def test_folder_without_parts(self): 1136 | with pytest.raises(HdfsError): 1137 | self.client.write('foo/bar', 'hello') 1138 | self.client.parts('foo') 1139 | 1140 | def 
test_folder_with_single_part(self): 1141 | fname = 'part-m-00000.avro' 1142 | self.client.write(psp.join('foo', fname), 'first') 1143 | assert self.client.parts('foo') == [fname] 1144 | 1145 | def test_folder_with_multiple_parts(self): 1146 | fnames = ['part-m-00000.avro', 'part-m-00001.avro'] 1147 | self.client.write(psp.join('foo', fnames[0]), 'first') 1148 | self.client.write(psp.join('foo', fnames[1]), 'second') 1149 | assert self.client.parts('foo') == fnames 1150 | 1151 | def test_folder_with_multiple_parts_and_others(self): 1152 | fnames = ['part-m-00000.avro', 'part-m-00001.avro'] 1153 | self.client.write(psp.join('foo', '.header'), 'metadata') 1154 | self.client.write(psp.join('foo', fnames[0]), 'first') 1155 | self.client.write(psp.join('foo', fnames[1]), 'second') 1156 | assert self.client.parts('foo') == fnames 1157 | 1158 | def test_with_selection(self): 1159 | fnames = ['part-m-00000.avro', 'part-m-00001.avro'] 1160 | self.client.write(psp.join('foo', '.header'), 'metadata') 1161 | self.client.write(psp.join('foo', fnames[0]), 'first') 1162 | self.client.write(psp.join('foo', fnames[1]), 'second') 1163 | parts = self.client.parts('foo', parts=1) 1164 | assert len(parts) == 1 1165 | assert parts[0] in fnames 1166 | 1167 | def test_with_selection(self): 1168 | fnames = ['part-m-00000.avro', 'part-m-00001.avro'] 1169 | self.client.write(psp.join('foo', '.header'), 'metadata') 1170 | self.client.write(psp.join('foo', fnames[0]), 'first') 1171 | self.client.write(psp.join('foo', fnames[1]), 'second') 1172 | assert self.client.parts('foo', parts=[1]) == fnames[1:] 1173 | 1174 | def test_with_status(self): 1175 | fname = 'part-m-00000.avro' 1176 | fpath = psp.join('foo', fname) 1177 | self.client.write(fpath, 'first') 1178 | status = self.client.status(fpath) 1179 | status['pathSuffix'] = fname 1180 | assert self.client.parts('foo', status=True) == [(fname, status)] 1181 | 1182 | 1183 | class TestMakeDirs(_IntegrationTest): 1184 | 1185 | def test_simple(self): 1186 | self.client.makedirs('foo') 1187 | assert self.client.status('foo')['type'] == 'DIRECTORY' 1188 | 1189 | def test_nested(self): 1190 | self.client.makedirs('foo/bar') 1191 | assert self.client.status('foo/bar')['type'] == 'DIRECTORY' 1192 | 1193 | def test_with_permission(self): 1194 | self.client.makedirs('foo', permission='733') 1195 | assert self.client.status('foo')['permission'] == '733' 1196 | 1197 | def test_overwrite_file(self): 1198 | with pytest.raises(HdfsError): 1199 | self.client.write('foo', 'hello') 1200 | self.client.makedirs('foo') 1201 | 1202 | def test_overwrite_directory_with_permission(self): 1203 | self.client.makedirs('foo', permission='733') 1204 | self.client.makedirs('foo/bar', permission='722') 1205 | assert self.client.status('foo')['permission'] == '733' 1206 | assert self.client.status('foo/bar')['permission'] == '722' 1207 | 1208 | 1209 | class TestSetTimes(_IntegrationTest): 1210 | 1211 | def test_none(self): 1212 | with pytest.raises(ValueError): 1213 | self.client.makedirs('foo') 1214 | self.client.set_times('foo') 1215 | 1216 | def test_missing(self): 1217 | with pytest.raises(HdfsError): 1218 | self.client.set_times('foo', 1234) 1219 | 1220 | @pytest.mark.skip() # HttpFS doesn't raise an error here. 
1221 | def test_negative(self): 1222 | with pytest.raises(HdfsError): 1223 | self.client.write('foo', 'hello') 1224 | self.client.set_times('foo', access_time=-1234) 1225 | 1226 | def test_file(self): 1227 | self.client.write('foo', 'hello') 1228 | self.client.set_times('foo', access_time=1234) 1229 | assert self.client.status('foo')['accessTime'] == 1234 1230 | self.client.set_times('foo', modification_time=12345) 1231 | assert self.client.status('foo')['modificationTime'] == 12345 1232 | self.client.set_times('foo', access_time=1, modification_time=2) 1233 | status = self.client.status('foo') 1234 | assert status['accessTime'] == 1 1235 | assert status['modificationTime'] == 2 1236 | 1237 | def test_folder(self): 1238 | self.client.write('foo/bar', 'hello') 1239 | self.client.set_times('foo', access_time=1234) 1240 | assert self.client.status('foo')['accessTime'] == 1234 1241 | self.client.set_times('foo', modification_time=12345) 1242 | assert self.client.status('foo')['modificationTime'] == 12345 1243 | self.client.set_times('foo', access_time=1, modification_time=2) 1244 | status = self.client.status('foo') 1245 | assert status['accessTime'] == 1 1246 | assert status['modificationTime'] == 2 1247 | 1248 | 1249 | class TestChecksum(_IntegrationTest): 1250 | 1251 | def test_missing(self): 1252 | with pytest.raises(HdfsError): 1253 | self.client.checksum('foo') 1254 | 1255 | def test_folder(self): 1256 | with pytest.raises(HdfsError): 1257 | self.client.makedirs('foo') 1258 | self.client.checksum('foo') 1259 | 1260 | def test_file(self): 1261 | self.client.write('foo', 'hello') 1262 | checksum = self.client.checksum('foo') 1263 | assert {'algorithm', 'bytes', 'length'} == set(checksum) 1264 | 1265 | 1266 | class TestSetReplication(_IntegrationTest): 1267 | 1268 | def test_missing(self): 1269 | with pytest.raises(HdfsError): 1270 | self.client.set_replication('foo', 1) 1271 | 1272 | def test_folder(self): 1273 | with pytest.raises(HdfsError): 1274 | self.client.makedirs('foo') 1275 | self.client.set_replication('foo', 1) 1276 | 1277 | def test_invalid_replication(self): 1278 | with pytest.raises(HdfsError): 1279 | self.client.write('foo', 'hello') 1280 | self.client.set_replication('foo', 0) 1281 | 1282 | def test_file(self): 1283 | self.client.write('foo', 'hello') 1284 | replication = self.client.status('foo')['replication'] + 1 1285 | self.client.set_replication('foo', replication) 1286 | assert self.client.status('foo')['replication'] == replication 1287 | 1288 | 1289 | class TestTokenClient(object): 1290 | 1291 | def test_without_session(self): 1292 | client = TokenClient('url', '123') 1293 | assert client._session.params['delegation'] == '123' 1294 | 1295 | def test_with_session(self): 1296 | session = rq.Session() 1297 | client = TokenClient('url', '123', session=session) 1298 | assert session.params['delegation'] == '123' 1299 | 1300 | 1301 | class TestSnapshot(_IntegrationTest): 1302 | 1303 | @classmethod 1304 | def setup_class(cls): 1305 | super(TestSnapshot, cls).setup_class() 1306 | if cls.client: 1307 | cls.client._mkdirs('foo') 1308 | try: 1309 | cls.client.allow_snapshot('foo') 1310 | except HdfsError as err: 1311 | if 'java.lang.IllegalArgumentException: No enum constant' in str(err): 1312 | cls.client = None 1313 | # Skip these tests if we get this error message from HDFS (currently 1314 | # happens using HTTPFS) which causes all snapshot operations to fail. 
1315 | else: 1316 | raise err 1317 | 1318 | def test_allow_snapshot(self): 1319 | self.client._mkdirs('foo') 1320 | self.client.allow_snapshot('foo') 1321 | 1322 | def test_allow_snapshot_double(self): 1323 | self.client._mkdirs('foo') 1324 | self.client.allow_snapshot('foo') 1325 | self.client.allow_snapshot('foo') 1326 | 1327 | def test_disallow_snapshot(self): 1328 | self.client._mkdirs('foo') 1329 | self.client.allow_snapshot('foo') 1330 | self.client.disallow_snapshot('foo') 1331 | 1332 | def test_disallow_no_allow(self): 1333 | self.client._mkdirs('foo') 1334 | self.client.disallow_snapshot('foo') 1335 | 1336 | def test_allow_snapshot_not_exists(self): 1337 | with pytest.raises(HdfsError): 1338 | self.client.allow_snapshot('foo') 1339 | 1340 | def test_disallow_snapshot_not_exists(self): 1341 | with pytest.raises(HdfsError): 1342 | self.client.disallow_snapshot('foo') 1343 | 1344 | def test_allow_snapshot_file(self): 1345 | with pytest.raises(HdfsError): 1346 | self._write('foo', b'hello') 1347 | self.client.allow_snapshot('foo') 1348 | 1349 | def test_disallow_snapshot_file(self): 1350 | with pytest.raises(HdfsError): 1351 | self._write('foo', b'hello') 1352 | self.client.disallow_snapshot('foo') 1353 | 1354 | def test_create_delete_snapshot(self): 1355 | # One cannot test creation and deletion separately, as one cannot 1356 | # clean HDFS for test isolation if a created snapshot remains 1357 | # undeleted. 1358 | self.client._mkdirs('foo') 1359 | self.client.allow_snapshot('foo') 1360 | self.client.create_snapshot('foo', 'mysnap') 1361 | self.client.delete_snapshot('foo', 'mysnap') 1362 | 1363 | def test_create_snapshot_name(self): 1364 | self.client._mkdirs('foo') 1365 | self.client.allow_snapshot('foo') 1366 | try: 1367 | snapshot_path = self.client.create_snapshot('foo', 'mysnap') 1368 | assert re.search(r'/foo/\.snapshot/mysnap$',snapshot_path) 1369 | finally: 1370 | # Cleanup, as it breaks other tests otherwise: the dir cannot be 1371 | # removed with an active snapshots. 1372 | self.client.delete_snapshot('foo', 'mysnap') 1373 | 1374 | def test_delete_snapshot_other(self): 1375 | with pytest.raises(HdfsError): 1376 | self.client._mkdirs('foo') 1377 | self.client.allow_snapshot('foo') 1378 | self.client.create_snapshot('foo', 'mysnap') 1379 | try: 1380 | self.client.delete_snapshot('foo', 'othersnap') 1381 | finally: 1382 | # Cleanup, as it breaks other tests otherwise: the dir cannot be 1383 | # removed with an active snapshots. 1384 | self.client.delete_snapshot('foo', 'mysnap') 1385 | 1386 | def test_disallow_snapshot_exists(self): 1387 | with pytest.raises(HdfsError): 1388 | self.client._mkdirs('foo_disallow') 1389 | self.client.allow_snapshot('foo_disallow') 1390 | self.client.create_snapshot('foo_disallow', 'mysnap') 1391 | try: 1392 | self.client.disallow_snapshot('foo_disallow') 1393 | finally: 1394 | # Cleanup, as it breaks other tests otherwise: the dir cannot be 1395 | # removed with an active snapshots. 
1396 | self.client.delete_snapshot('foo_disallow', 'mysnap') 1397 | 1398 | def test_create_snapshot_noallow(self): 1399 | with pytest.raises(HdfsError): 1400 | self.client._mkdirs('foo') 1401 | self.client.create_snapshot('foo', 'mysnap') 1402 | 1403 | def test_delete_snapshot_noallow(self): 1404 | with pytest.raises(HdfsError): 1405 | self.client._mkdirs('foo') 1406 | self.client.delete_snapshot('foo', 'mysnap') 1407 | 1408 | def test_create_snapshot_noexist(self): 1409 | with pytest.raises(HdfsError): 1410 | self.client.create_snapshot('foo', 'mysnap') 1411 | 1412 | def test_rename_snapshot(self): 1413 | self.client._mkdirs('foo') 1414 | self.client.allow_snapshot('foo') 1415 | self.client.create_snapshot('foo', 'myspan') 1416 | try: 1417 | self.client.rename_snapshot('foo', 'myspan', 'yourspan') 1418 | finally: 1419 | self.client.delete_snapshot('foo', 'yourspan') 1420 | 1421 | def test_rename_snapshot_not_exists(self): 1422 | with pytest.raises(HdfsError): 1423 | self.client.rename_snapshot('foo', 'myspan', 'yourspan') 1424 | 1425 | def test_rename_snapshot_not_overwrite(self): 1426 | with pytest.raises(HdfsError): 1427 | self.client._mkdirs('foo') 1428 | self.client.allow_snapshot('foo') 1429 | self.client.create_snapshot('foo', 'myspan') 1430 | self.client.create_snapshot('foo', 'yourspan') 1431 | try: 1432 | self.client.rename_snapshot('foo', 'myspan', 'yourspan') 1433 | finally: 1434 | self.client.delete_snapshot('foo', 'myspan') 1435 | self.client.delete_snapshot('foo', 'yourspan') 1436 | -------------------------------------------------------------------------------- /test/test_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test configuration module.""" 5 | 6 | from hdfs.client import Client 7 | from hdfs.config import Config 8 | from hdfs.util import HdfsError, temppath 9 | from logging.handlers import TimedRotatingFileHandler 10 | from string import Template 11 | from test.util import save_config 12 | import logging as lg 13 | import os 14 | import os.path as osp 15 | import pytest 16 | import sys 17 | 18 | 19 | class TestConfig(object): 20 | 21 | @pytest.mark.skip(reason="TODO: Find cross-platform way to reset the environment variable.") 22 | def test_config_path(self): 23 | path = os.getenv('HDFSCLI_CONFIG') 24 | try: 25 | with temppath() as tpath: 26 | os.environ['HDFSCLI_CONFIG'] = tpath 27 | with open(tpath, 'w') as writer: 28 | writer.write('[foo]\nbar=hello') 29 | assert Config().get('foo', 'bar') == 'hello' 30 | finally: 31 | if path: 32 | os['HDFSCLI_CONFIG'] = path 33 | else: 34 | del os['HDFSCLI_CONFIG'] 35 | 36 | def _write_client_module(self, path, class_name): 37 | template = osp.join(osp.dirname(__file__), 'dat', 'client_template.py') 38 | with open(template) as reader: 39 | contents = Template(reader.read()).substitute({ 40 | 'class_name': class_name, 41 | }) 42 | with open(path, 'w') as writer: 43 | writer.write(contents) 44 | 45 | def test_autoload_client_from_path(self): 46 | with temppath() as module_path: 47 | self._write_client_module(module_path, 'PathClient') 48 | with temppath() as config_path: 49 | config = Config(config_path) 50 | config.add_section(config.global_section) 51 | config.set(config.global_section, 'autoload.paths', module_path) 52 | config._autoload() 53 | client = Client.from_options({'url': ''}, 'PathClient') 54 | assert client.one == 1 55 | 56 | def test_autoload_missing_path(self): 57 | with pytest.raises(SystemExit): 58 | with 
temppath() as module_path: 59 | with temppath() as config_path: 60 | config = Config(config_path) 61 | config.add_section(config.global_section) 62 | config.set(config.global_section, 'autoload.paths', module_path) 63 | config._autoload() 64 | 65 | def test_autoload_client_from_module(self): 66 | with temppath() as module_dpath: 67 | os.mkdir(module_dpath) 68 | sys.path.append(module_dpath) 69 | module_fpath = osp.join(module_dpath, 'mclient.py') 70 | self._write_client_module(module_fpath, 'ModuleClient') 71 | try: 72 | with temppath() as config_path: 73 | config = Config(config_path) 74 | config.add_section(config.global_section) 75 | config.set(config.global_section, 'autoload.modules', 'mclient') 76 | config._autoload() 77 | client = Client.from_options({'url': ''}, 'ModuleClient') 78 | assert client.one == 1 79 | finally: 80 | sys.path.remove(module_dpath) 81 | 82 | def test_create_client_with_alias(self): 83 | with temppath() as tpath: 84 | config = Config(path=tpath) 85 | section = 'dev.alias' 86 | config.add_section(section) 87 | config.set(section, 'url', 'http://host:port') 88 | save_config(config) 89 | Config(path=tpath).get_client('dev') 90 | 91 | def test_create_client_with_alias_and_timeout(self): 92 | with temppath() as tpath: 93 | config = Config(path=tpath) 94 | section = 'dev.alias' 95 | config.add_section(section) 96 | config.set(section, 'url', 'http://host:port') 97 | config.set(section, 'timeout', '1') 98 | save_config(config) 99 | assert Config(path=tpath).get_client('dev')._timeout == 1 100 | config.set(section, 'timeout', '1,2') 101 | save_config(config) 102 | assert Config(path=tpath).get_client('dev')._timeout == (1,2) 103 | 104 | def test_create_client_with_missing_alias(self): 105 | with pytest.raises(HdfsError): 106 | with temppath() as tpath: 107 | Config(tpath).get_client('dev') 108 | 109 | def test_create_client_with_no_alias_without_default(self): 110 | with pytest.raises(HdfsError): 111 | with temppath() as tpath: 112 | Config(tpath).get_client() 113 | 114 | def test_create_client_with_default_alias(self): 115 | with temppath() as tpath: 116 | config = Config(tpath) 117 | config.add_section(config.global_section) 118 | config.set(config.global_section, 'default.alias', 'dev') 119 | section = 'dev.alias' 120 | config.add_section(section) 121 | config.set(section, 'url', 'http://host:port') 122 | save_config(config) 123 | Config(tpath).get_client() 124 | 125 | def test_get_file_handler(self): 126 | with temppath() as tpath: 127 | config = Config(tpath) 128 | handler = config.get_log_handler('cmd') 129 | assert isinstance(handler, TimedRotatingFileHandler) 130 | 131 | def test_disable_file_logging(self): 132 | with temppath() as tpath: 133 | config = Config(tpath) 134 | config.add_section('cmd.command') 135 | config.set('cmd.command', 'log.disable', 'true') 136 | save_config(config) 137 | config = Config(tpath) 138 | handler = config.get_log_handler('cmd') 139 | assert not isinstance(handler, TimedRotatingFileHandler) 140 | -------------------------------------------------------------------------------- /test/test_examples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test that the examples run correctly.""" 5 | 6 | from hdfs import Config 7 | from hdfs.config import _load_source 8 | from six import add_metaclass 9 | from test.util import _IntegrationTest 10 | import os 11 | import os.path as osp 12 | import pytest 13 | 14 | class _ExamplesType(type): 15 | 16 | 
"""Metaclass generating a test for each example.""" 17 | 18 | dpath = osp.join(osp.dirname(__file__), os.pardir, 'examples') 19 | 20 | def __new__(mcs, cls, bases, attrs): 21 | 22 | def make_test(fname): 23 | fpath = osp.join(mcs.dpath, fname) 24 | module = osp.splitext(fname)[0] 25 | 26 | def test(self): 27 | try: 28 | _load_source(module, fpath) 29 | except ImportError: 30 | # Unmet dependency. 31 | pytest.skip() 32 | 33 | test.__name__ = 'test_{}'.format(module) 34 | test.__doc__ = 'Test for example {}.'.format(fpath) 35 | return test 36 | 37 | for fname in os.listdir(mcs.dpath): 38 | if osp.splitext(fname)[1] == '.py': 39 | test = make_test(fname) 40 | attrs[test.__name__] = test 41 | return super(_ExamplesType, mcs).__new__(mcs, cls, bases, attrs) 42 | 43 | 44 | @add_metaclass(_ExamplesType) 45 | class TestExamples(_IntegrationTest): 46 | 47 | """Empty since tests are injected by the metaclass.""" 48 | 49 | _get_client = None 50 | 51 | @classmethod 52 | def setup_class(cls): 53 | super(TestExamples, cls).setup_class() 54 | cls._get_client = Config.get_client 55 | Config.get_client = staticmethod(lambda: cls.client) 56 | 57 | @classmethod 58 | def teardown_class(cls): 59 | Config.get_client = cls._get_client 60 | super(TestExamples, cls).teardown_class() 61 | -------------------------------------------------------------------------------- /test/test_ext_avro.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test Avro extension.""" 5 | 6 | from hdfs.util import HdfsError, temppath 7 | from json import dumps, load, loads 8 | from test.util import _IntegrationTest 9 | import os 10 | import os.path as osp 11 | import pytest 12 | 13 | try: 14 | from hdfs.ext.avro import (_SeekableReader, _SchemaInferrer, AvroReader, 15 | AvroWriter) 16 | from hdfs.ext.avro.__main__ import main 17 | except ImportError: 18 | SKIP = True 19 | else: 20 | SKIP = False 21 | 22 | 23 | class TestSeekableReader(object): 24 | 25 | def setup_method(self): 26 | if SKIP: 27 | pytest.skip() 28 | 29 | def test_normal_read(self): 30 | with temppath() as tpath: 31 | with open(tpath, 'w') as writer: 32 | writer.write('abcd') 33 | with open(tpath) as reader: 34 | sreader = _SeekableReader(reader) 35 | assert sreader.read(3) == 'abc' 36 | assert sreader.read(2) == 'd' 37 | assert not sreader.read(1) 38 | 39 | def test_buffered_read(self): 40 | with temppath() as tpath: 41 | with open(tpath, 'w') as writer: 42 | writer.write('abcdefghi') 43 | with open(tpath) as reader: 44 | sreader = _SeekableReader(reader, 3) 45 | assert sreader.read(1) == 'a' 46 | assert sreader.read(3) == 'bcd' 47 | sreader.seek(-3, os.SEEK_CUR) 48 | assert sreader.read(2) == 'bc' 49 | assert sreader.read(6) == 'defghi' 50 | assert not sreader.read(1) 51 | 52 | 53 | class TestInferSchema(object): 54 | 55 | def setup_method(self): 56 | if SKIP: 57 | pytest.skip() 58 | 59 | 60 | def test_array(self): 61 | assert ( 62 | _SchemaInferrer().infer({'foo': 1, 'bar': ['hello']}) == 63 | { 64 | 'type': 'record', 65 | 'name': '__Record1', 66 | 'fields': [ 67 | {'type': {'type': 'array', 'items': 'string'}, 'name': 'bar'}, 68 | {'type': 'int', 'name': 'foo'}, 69 | ] 70 | }) 71 | 72 | def test_flat_record(self): 73 | assert ( 74 | _SchemaInferrer().infer({'foo': 1, 'bar': 'hello'}) == 75 | { 76 | 'type': 'record', 77 | 'name': '__Record1', 78 | 'fields': [ 79 | {'type': 'string', 'name': 'bar'}, 80 | {'type': 'int', 'name': 'foo'}, 81 | ] 82 | }) 83 | 84 | def 
test_nested_record(self): 85 | assert ( 86 | _SchemaInferrer().infer({'foo': {'bax': 2}, 'bar': {'baz': 3}}) == 87 | { 88 | 'type': 'record', 89 | 'name': '__Record1', 90 | 'fields': [ 91 | { 92 | 'type': { 93 | 'type': 'record', 94 | 'name': '__Record2', 95 | 'fields': [{'type': 'int', 'name': 'baz'}] 96 | }, 97 | 'name': 'bar', 98 | }, 99 | { 100 | 'type': { 101 | 'type': 'record', 102 | 'name': '__Record3', 103 | 'fields': [{'type': 'int', 'name': 'bax'}] 104 | }, 105 | 'name': 'foo', 106 | }, 107 | ] 108 | }) 109 | 110 | 111 | class _AvroIntegrationTest(_IntegrationTest): 112 | 113 | dpath = osp.join(osp.dirname(__file__), 'dat') 114 | schema = None 115 | records = None 116 | 117 | @classmethod 118 | def setup_class(cls): 119 | if SKIP: 120 | return 121 | super(_AvroIntegrationTest, cls).setup_class() 122 | with open(osp.join(cls.dpath, 'weather.avsc')) as reader: 123 | cls.schema = loads(reader.read()) 124 | with open(osp.join(cls.dpath, 'weather.jsonl')) as reader: 125 | cls.records = [loads(line) for line in reader] 126 | 127 | @classmethod 128 | def _get_data_bytes(cls, fpath): 129 | # Get Avro bytes, skipping header (order of schema fields is undefined) and 130 | # sync marker. This assumes that the file can be written in a single block. 131 | with open(fpath, 'rb') as reader: 132 | reader.seek(-16, os.SEEK_END) # Sync marker always last 16 bytes. 133 | sync_marker = reader.read() 134 | reader.seek(0) 135 | content = reader.read() 136 | sync_pos = content.find(sync_marker) 137 | return content[sync_pos + 16:-16] 138 | 139 | 140 | class TestRead(_AvroIntegrationTest): 141 | 142 | def test_read(self): 143 | self.client.upload('weather.avro', osp.join(self.dpath, 'weather.avro')) 144 | with AvroReader(self.client, 'weather.avro') as reader: 145 | assert list(reader) == self.records 146 | 147 | def test_read_with_same_schema(self): 148 | self.client.upload('w.avro', osp.join(self.dpath, 'weather.avro')) 149 | with AvroReader(self.client, 'w.avro', reader_schema=self.schema) as reader: 150 | assert list(reader) == self.records 151 | 152 | def test_read_with_compatible_schema(self): 153 | self.client.upload('w.avro', osp.join(self.dpath, 'weather.avro')) 154 | schema = { 155 | 'name': 'test.Weather', 156 | 'type': 'record', 157 | 'fields': [ 158 | {'name': 'temp', 'type': 'int'}, 159 | {'name': 'tag', 'type': 'string', 'default': ''}, 160 | ], 161 | } 162 | with AvroReader(self.client, 'w.avro', reader_schema=schema) as reader: 163 | assert ( 164 | list(reader) == 165 | [{'temp': r['temp'], 'tag': ''} for r in self.records]) 166 | 167 | 168 | class TestWriter(_AvroIntegrationTest): 169 | 170 | def test_write(self): 171 | writer = AvroWriter( 172 | self.client, 173 | 'weather.avro', 174 | schema=self.schema, 175 | ) 176 | with writer: 177 | for record in self.records: 178 | writer.write(record) 179 | with temppath() as tpath: 180 | self.client.download('weather.avro', tpath) 181 | assert ( 182 | self._get_data_bytes(osp.join(self.dpath, 'weather.avro')) == 183 | self._get_data_bytes(tpath)) 184 | 185 | def test_write_in_multiple_blocks(self): 186 | writer = AvroWriter( 187 | self.client, 188 | 'weather.avro', 189 | schema=self.schema, 190 | sync_interval=1 # Flush block on every write. 
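#   (Because every write flushes the current block, the resulting file spans
#   several Avro blocks instead of the single block the defaults would likely
#   produce; that multi-block layout is exactly what this round trip is meant
#   to exercise.)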
191 | ) 192 | with writer: 193 | for record in self.records: 194 | writer.write(record) 195 | with AvroReader(self.client, 'weather.avro') as reader: 196 | assert list(reader) == self.records 197 | 198 | def test_write_empty(self): 199 | with AvroWriter(self.client, 'empty.avro', schema=self.schema): 200 | pass 201 | with AvroReader(self.client, 'empty.avro') as reader: 202 | assert reader.schema == self.schema 203 | assert list(reader) == [] 204 | 205 | def test_write_overwrite_error(self): 206 | with pytest.raises(HdfsError): 207 | # To check that the background `AsyncWriter` thread doesn't hang. 208 | self.client.makedirs('weather.avro') 209 | with AvroWriter(self.client, 'weather.avro', schema=self.schema) as writer: 210 | for record in self.records: 211 | writer.write(record) 212 | 213 | def test_infer_schema(self): 214 | with AvroWriter(self.client, 'weather.avro') as writer: 215 | for record in self.records: 216 | writer.write(record) 217 | with AvroReader(self.client, 'weather.avro') as reader: 218 | assert list(reader) == self.records 219 | 220 | 221 | class TestMain(_AvroIntegrationTest): 222 | 223 | def test_schema(self): 224 | self.client.upload('weather.avro', osp.join(self.dpath, 'weather.avro')) 225 | with temppath() as tpath: 226 | with open(tpath, 'w') as writer: 227 | main(['schema', 'weather.avro'], client=self.client, stdout=writer) 228 | with open(tpath) as reader: 229 | schema = load(reader) 230 | assert self.schema == schema 231 | 232 | def test_read(self): 233 | self.client.upload('weather.avro', osp.join(self.dpath, 'weather.avro')) 234 | with temppath() as tpath: 235 | with open(tpath, 'w') as writer: 236 | main( 237 | ['read', 'weather.avro', '--num', '2'], 238 | client=self.client, 239 | stdout=writer 240 | ) 241 | with open(tpath) as reader: 242 | records = [loads(line) for line in reader] 243 | assert records == self.records[:2] 244 | 245 | def test_read_part_file(self): 246 | data = { 247 | 'part-m-00000.avro': [{'name': 'jane'}, {'name': 'bob'}], 248 | 'part-m-00001.avro': [{'name': 'john'}, {'name': 'liz'}], 249 | } 250 | for fname, records in data.items(): 251 | with AvroWriter(self.client, 'data.avro/{}'.format(fname)) as writer: 252 | for record in records: 253 | writer.write(record) 254 | with temppath() as tpath: 255 | with open(tpath, 'w') as writer: 256 | main( 257 | ['read', 'data.avro', '--parts', '1,'], 258 | client=self.client, 259 | stdout=writer 260 | ) 261 | with open(tpath) as reader: 262 | records = [loads(line) for line in reader] 263 | assert records == data['part-m-00001.avro'] 264 | 265 | def test_write(self): 266 | with open(osp.join(self.dpath, 'weather.jsonl')) as reader: 267 | main( 268 | [ 269 | 'write', 'weather.avro', 270 | '--schema', dumps(self.schema), 271 | '--codec', 'null', 272 | ], 273 | client=self.client, 274 | stdin=reader 275 | ) 276 | with temppath() as tpath: 277 | self.client.download('weather.avro', tpath) 278 | assert ( 279 | self._get_data_bytes(tpath) == 280 | self._get_data_bytes(osp.join(self.dpath, 'weather.avro'))) 281 | 282 | def test_write_codec(self): 283 | with open(osp.join(self.dpath, 'weather.jsonl')) as reader: 284 | main( 285 | [ 286 | 'write', 'weather.avro', 287 | '--schema', dumps(self.schema), 288 | '--codec', 'deflate', 289 | ], 290 | client=self.client, 291 | stdin=reader 292 | ) 293 | # Correct content. 
294 | with AvroReader(self.client, 'weather.avro') as reader: 295 | records = list(reader) 296 | assert records == self.records 297 | # Different size (might not be smaller, since very small file). 298 | compressed_size = self.client.content('weather.avro')['length'] 299 | uncompressed_size = osp.getsize(osp.join(self.dpath, 'weather.avro')) 300 | assert compressed_size != uncompressed_size 301 | -------------------------------------------------------------------------------- /test/test_ext_dataframe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test Dataframe extension.""" 5 | 6 | from hdfs.util import HdfsError, temppath 7 | from json import loads 8 | from test.util import _IntegrationTest 9 | import os.path as osp 10 | 11 | try: 12 | from hdfs.ext.avro import AvroReader 13 | from hdfs.ext.dataframe import read_dataframe, write_dataframe 14 | from pandas.testing import assert_frame_equal 15 | import pandas as pd 16 | except ImportError: 17 | SKIP = True 18 | else: 19 | SKIP = False 20 | 21 | 22 | class _DataFrameIntegrationTest(_IntegrationTest): 23 | 24 | dpath = osp.join(osp.dirname(__file__), 'dat') 25 | records = None 26 | df = None 27 | 28 | @classmethod 29 | def setup_class(cls): 30 | if SKIP: 31 | return 32 | super(_DataFrameIntegrationTest, cls).setup_class() 33 | with open(osp.join(cls.dpath, 'weather.jsonl')) as reader: 34 | cls.records = [loads(line) for line in reader] 35 | cls.df = pd.DataFrame.from_records(cls.records) 36 | 37 | 38 | class TestReadDataFrame(_DataFrameIntegrationTest): 39 | 40 | def test_read(self): 41 | self.client.upload('weather.avro', osp.join(self.dpath, 'weather.avro')) 42 | assert_frame_equal( 43 | read_dataframe(self.client, 'weather.avro'), 44 | self.df 45 | ) 46 | 47 | 48 | class TestWriteDataFrame(_DataFrameIntegrationTest): 49 | 50 | def test_write(self): 51 | write_dataframe(self.client, 'weather.avro', self.df) 52 | with AvroReader(self.client, 'weather.avro') as reader: 53 | assert list(reader) == self.records 54 | 55 | 56 | class TestReadWriteDataFrame(_DataFrameIntegrationTest): 57 | 58 | def test_column_order(self): 59 | # Column order should be preserved, not just alphabetical. 
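# Concretely: the frame below reorders the columns to ['temp', 'station',
# 'time'], and the assertion checks that the write_dataframe / read_dataframe
# round trip hands back that same ordering rather than an alphabetised one.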
60 | df = self.df[['temp', 'station', 'time']] 61 | write_dataframe(self.client, 'weather-ordered.avro', df) 62 | assert_frame_equal( 63 | read_dataframe(self.client, 'weather-ordered.avro'), 64 | df 65 | ) 66 | -------------------------------------------------------------------------------- /test/test_ext_kerberos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test Kerberos extension.""" 5 | 6 | from threading import Lock, Thread 7 | from time import sleep, time 8 | import sys 9 | 10 | 11 | class MockHTTPKerberosAuth(object): 12 | 13 | def __init__(self, **kwargs): 14 | self._lock = Lock() 15 | self._calls = set() 16 | self._items = [] 17 | 18 | def __call__(self, n): 19 | with self._lock: 20 | assert not self._items 21 | self._items.append(n) 22 | sleep(0.25) 23 | with self._lock: 24 | thread = self._items.pop() 25 | assert thread == n 26 | self._calls.add(thread) 27 | 28 | 29 | class MockModule(object): 30 | def __init__(self): 31 | self.HTTPKerberosAuth = MockHTTPKerberosAuth 32 | 33 | 34 | sys.modules['requests_kerberos'] = MockModule() 35 | 36 | from hdfs.ext.kerberos import _HdfsHTTPKerberosAuth 37 | 38 | 39 | class TestKerberosClient(object): 40 | 41 | def test_max_concurrency(self): 42 | auth = _HdfsHTTPKerberosAuth(1, mutual_auth='OPTIONAL') 43 | t1 = Thread(target=auth.__call__, args=(1, )) 44 | t1.start() 45 | t2 = Thread(target=auth.__call__, args=(2, )) 46 | t2.start() 47 | t1.join() 48 | t2.join() 49 | assert auth._calls == {1, 2} 50 | -------------------------------------------------------------------------------- /test/test_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """Test CLI.""" 5 | 6 | from hdfs.__main__ import _Progress, configure_client, main, parse_arg 7 | from hdfs.config import Config, NullHandler 8 | from hdfs.util import HdfsError, temppath 9 | from logging.handlers import TimedRotatingFileHandler 10 | from test.util import _IntegrationTest 11 | import filecmp 12 | import logging as lg 13 | import os 14 | import os.path as osp 15 | import pytest 16 | import sys 17 | 18 | 19 | class TestParseArg(object): 20 | 21 | def test_parse_invalid(self): 22 | with pytest.raises(HdfsError): 23 | parse_arg({'foo': 'a'}, 'foo', int) 24 | 25 | def test_parse_int(self): 26 | assert parse_arg({'foo': '1'}, 'foo', int) == 1 27 | assert parse_arg({'foo': '1'}, 'foo', int, ',') == 1 28 | 29 | def test_parse_float(self): 30 | assert parse_arg({'foo': '123.4'}, 'foo', float) == 123.4 31 | 32 | def test_parse_int_list(self): 33 | assert parse_arg({'foo': '1,'}, 'foo', int, ',') == [1] 34 | assert parse_arg({'foo': '1,2'}, 'foo', int, ',') == [1,2] 35 | 36 | 37 | class TestConfigureClient(object): 38 | 39 | def test_with_alias(self): 40 | url = 'http://host:port' 41 | with temppath() as tpath: 42 | config = Config(path=tpath) 43 | section = 'dev.alias' 44 | config.add_section(section) 45 | config.set(section, 'url', url) 46 | args = {'--alias': 'dev', '--log': False, '--verbose': 0} 47 | client = configure_client('test', args, config=config) 48 | assert client.url == url 49 | assert client.urls == [url] 50 | 51 | 52 | class TestProgress(object): 53 | 54 | def test_single_file(self): 55 | with temppath() as tpath: 56 | with open(tpath, 'w') as writer: 57 | progress = _Progress(100, 1, writer=writer) 58 | progress('foo', 60) 59 | assert progress._data['foo'] == 60 60 | assert 
progress._pending_files == 0 61 | assert progress._downloading_files == 1 62 | progress('foo', 40) 63 | progress('foo', -1) 64 | assert progress._downloading_files == 0 65 | assert progress._complete_files == 1 66 | 67 | def test_from_local_path(self): 68 | with temppath() as dpath: 69 | os.mkdir(dpath) 70 | fpath1 = osp.join(dpath, 'foo') 71 | with open(fpath1, 'w') as writer: 72 | writer.write('hey') 73 | os.mkdir(osp.join(dpath, 'bar')) 74 | fpath2 = osp.join(dpath, 'bar', 'baz') 75 | with open(fpath2, 'w') as writer: 76 | writer.write('hello') 77 | with temppath() as tpath: 78 | with open(tpath, 'w') as writer: 79 | progress = _Progress.from_local_path(dpath, writer=writer) 80 | assert progress._total_bytes == 8 81 | assert progress._pending_files == 2 82 | 83 | 84 | class TestMain(_IntegrationTest): 85 | 86 | dpath = osp.join(osp.dirname(__file__), 'dat') 87 | 88 | def setup_method(self): 89 | self._root_logger = lg.getLogger() 90 | self._handlers = self._root_logger.handlers 91 | super(TestMain, self).setup_method() 92 | 93 | def teardown_method(self): 94 | self._root_logger.handlers = self._handlers 95 | 96 | def _dircmp(self, dpath): 97 | dircmp = filecmp.dircmp(self.dpath, dpath) 98 | assert not dircmp.left_only 99 | assert not dircmp.right_only 100 | assert not dircmp.diff_files 101 | 102 | def test_download(self): 103 | self.client.upload('foo', self.dpath) 104 | with temppath() as tpath: 105 | main( 106 | ['download', 'foo', tpath, '--silent', '--threads', '1'], 107 | self.client 108 | ) 109 | self._dircmp(tpath) 110 | 111 | def test_download_stream(self): 112 | self.client.write('foo', 'hello') 113 | with temppath() as tpath: 114 | stdout = sys.stdout 115 | try: 116 | with open(tpath, 'wb') as writer: 117 | sys.stdout = writer 118 | main( 119 | ['download', 'foo', '-', '--silent', '--threads', '1'], 120 | self.client 121 | ) 122 | finally: 123 | sys.stdout = stdout 124 | with open(tpath) as reader: 125 | assert reader.read() == 'hello' 126 | 127 | def test_download_stream_multiple_files(self): 128 | with pytest.raises(SystemExit): 129 | self.client.upload('foo', self.dpath) 130 | main( 131 | ['download', 'foo', '-', '--silent', '--threads', '1'], 132 | self.client 133 | ) 134 | 135 | def test_download_overwrite(self): 136 | with pytest.raises(SystemExit): 137 | self.client.upload('foo', self.dpath) 138 | with temppath() as tpath: 139 | with open(tpath, 'w'): 140 | pass 141 | main( 142 | ['download', 'foo', tpath, '--silent', '--threads', '1'], 143 | self.client 144 | ) 145 | self._dircmp(tpath) 146 | 147 | def test_download_force(self): 148 | self.client.write('foo', 'hey') 149 | with temppath() as tpath: 150 | with open(tpath, 'w'): 151 | pass 152 | main( 153 | ['download', 'foo', tpath, '--silent', '--force', '--threads', '1'], 154 | self.client 155 | ) 156 | with open(tpath) as reader: 157 | assert reader.read() == 'hey' 158 | 159 | def test_upload(self): 160 | main( 161 | ['upload', self.dpath, 'bar', '--silent', '--threads', '1'], 162 | self.client 163 | ) 164 | with temppath() as tpath: 165 | self.client.download('bar', tpath) 166 | self._dircmp(tpath) 167 | 168 | def test_upload_overwrite(self): 169 | with pytest.raises(SystemExit): 170 | self.client.write('bar', 'hey') 171 | main( 172 | ['upload', self.dpath, 'bar', '--silent', '--threads', '1'], 173 | self.client 174 | ) 175 | 176 | def test_upload_force(self): 177 | self.client.write('bar', 'hey') 178 | main( 179 | ['upload', self.dpath, 'bar', '--silent', '--threads', '1', '--force'], 180 | self.client 181 | ) 182 | 
183 |       self.client.download('bar', tpath)
184 |       self._dircmp(tpath)
185 | 
186 |   def test_upload_append(self):
187 |     with temppath() as tpath:
188 |       with open(tpath, 'w') as writer:
189 |         writer.write('hey')
190 |       main(['upload', tpath, 'bar', '--silent', '--threads', '1'], self.client)
191 |       main(
192 |         ['upload', tpath, 'bar', '--silent', '--threads', '1', '--append'],
193 |         self.client
194 |       )
195 |       with self.client.read('bar') as reader:
196 |         assert reader.read() == b'heyhey'
197 | 
198 |   def test_upload_append_folder(self):
199 |     with pytest.raises(SystemExit):
200 |       with temppath() as tpath:
201 |         main(['upload', self.dpath, '--silent', '--append'], self.client)
202 | 
--------------------------------------------------------------------------------
/test/test_util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | """Test Hdfs utilities."""
5 | 
6 | from hdfs.util import *
7 | import pytest
8 | 
9 | 
10 | class TestAsyncWriter(object):
11 | 
12 |   def test_basic(self):
13 |     result = []
14 |     def consumer(gen):
15 |       result.append(list(gen))
16 |     with AsyncWriter(consumer) as writer:
17 |       writer.write('one')
18 |       writer.write('two')
19 |     assert result == [['one','two']]
20 | 
21 |   def test_multiple_writer_uses(self):
22 |     result = []
23 |     def consumer(gen):
24 |       result.append(list(gen))
25 |     writer = AsyncWriter(consumer)
26 |     with writer:
27 |       writer.write('one')
28 |       writer.write('two')
29 |     with writer:
30 |       writer.write('three')
31 |       writer.write('four')
32 |     assert result == [['one','two'],['three','four']]
33 | 
34 |   def test_multiple_consumer_uses(self):
35 |     result = []
36 |     def consumer(gen):
37 |       result.append(list(gen))
38 |     with AsyncWriter(consumer) as writer:
39 |       writer.write('one')
40 |       writer.write('two')
41 |     with AsyncWriter(consumer) as writer:
42 |       writer.write('three')
43 |       writer.write('four')
44 |     assert result == [['one','two'],['three','four']]
45 | 
46 |   def test_nested(self):
47 |     with pytest.raises(ValueError):
48 |       result = []
49 |       def consumer(gen):
50 |         result.append(list(gen))
51 |       with AsyncWriter(consumer) as _writer:
52 |         _writer.write('one')
53 |         with _writer as writer:
54 |           writer.write('two')
55 | 
56 |   def test_child_error(self):
57 |     with pytest.raises(HdfsError):
58 |       def consumer(gen):
59 |         for value in gen:
60 |           if value == 'two':
61 |             raise HdfsError('Yo')
62 |       with AsyncWriter(consumer) as writer:
63 |         writer.write('one')
64 |         writer.write('two')
65 | 
66 |   def test_parent_error(self):
67 |     with pytest.raises(HdfsError):
68 |       def consumer(gen):
69 |         for value in gen:
70 |           pass
71 |       def invalid(w):
72 |         w.write('one')
73 |         raise HdfsError('Ya')
74 |       with AsyncWriter(consumer) as writer:
75 |         invalid(writer)
76 | 
77 | 
78 | class TestTemppath(object):
79 | 
80 |   def test_new(self):
81 |     with temppath() as tpath:
82 |       assert not osp.exists(tpath)
83 | 
84 |   def test_cleanup(self):
85 |     with temppath() as tpath:
86 |       with open(tpath, 'w') as writer:
87 |         writer.write('hi')
88 |     assert not osp.exists(tpath)
89 | 
90 |   def test_dpath(self):
91 |     with temppath() as dpath:
92 |       os.mkdir(dpath)
93 |       with temppath(dpath) as tpath:
94 |         assert osp.dirname(tpath) == dpath
95 | 
--------------------------------------------------------------------------------
/test/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | """Test helpers."""
5 | 
6 | from hdfs import InsecureClient
7 | from hdfs.config import Config
8 | from hdfs.util import HdfsError
9 | from requests.exceptions import ConnectionError
10 | from six.moves.configparser import NoOptionError, NoSectionError
11 | from time import sleep
12 | import os
13 | import posixpath as psp
14 | import pytest
15 | 
16 | 
17 | def save_config(config, path=None):
18 |   """Save configuration to file.
19 | 
20 |   :param config: :class:`~hdfs.config.Config` instance.
21 |   :param path: Optional destination path, defaulting to the configuration's own path.
22 |   """
23 |   with open(path or config.path, 'w') as writer:
24 |     config.write(writer)
25 | 
26 | 
27 | class _IntegrationTest(object):
28 | 
29 |   """Base class to run tests using remote HDFS.
30 | 
31 |   These tests are run only if a `HDFSCLI_TEST_ALIAS` or `HDFSCLI_TEST_URL`
32 |   environment variable is defined (the former taking precedence). For safety,
33 |   a suffix is appended to any defined root.
34 | 
35 |   .. warning::
36 | 
37 |     The new root directory used is entirely cleaned during tests!
38 | 
39 |   It also provides a few helper methods.
40 | 
41 |   """
42 | 
43 |   client = None
44 |   delay = 0.5 # Delay in seconds between tests.
45 |   root_suffix = '.hdfscli' # Also used as default root if none specified.
46 | 
47 |   @classmethod
48 |   def setup_class(cls):
49 |     alias = os.getenv('HDFSCLI_TEST_ALIAS')
50 |     url = os.getenv('HDFSCLI_TEST_URL')
51 |     if alias:
52 |       cls.client = Config().get_client(alias)
53 |       if cls.client.root:
54 |         cls.client.root = psp.join(cls.client.root, cls.root_suffix)
55 |       else:
56 |         cls.client.root = cls.root_suffix
57 |     elif url:
58 |       cls.client = InsecureClient(url, root=cls.root_suffix)
59 | 
60 |   @classmethod
61 |   def teardown_class(cls):
62 |     if cls.client:
63 |       cls.client.delete('', recursive=True)
64 | 
65 |   def setup_method(self):
66 |     if not self.client:
67 |       pytest.skip()
68 |     else:
69 |       try:
70 |         self.client.delete('', recursive=True)
71 |         # The delete call above is wrapped in a `ConnectionError` handler
72 |         # because, on HttpFS only, reusing a streamed connection that wasn't
73 |         # fully read causes failures (even when the response is closed
74 |         # explicitly, it behaves differently than when all of its content has
75 |         # been read). One test which triggers this is
76 |         # `test_ext_avro.py:TestMain.test_schema`. This seems related to
77 |         # https://github.com/kennethreitz/requests/issues/1915 (even on more
78 |         # recent versions of `requests` though).
79 |         #
80 |         # Here is a simple test case that will pass on WebHDFS but fail on
81 |         # HttpFS:
82 |         #
83 |         # .. code:: python
84 |         #
85 |         #   client = Config().get_client('test-webhdfs')
86 |         #   client.write('foo', 'hello')
87 |         #   with client.read('foo') as reader:
88 |         #     pass # Will succeed if this is replaced by `reader.read()`.
89 |         #   client.delete('foo')
90 |         #
91 |       except ConnectionError:
92 |         self.client.delete('', recursive=True) # Retry.
93 |       finally:
94 |         sleep(self.delay)
95 | 
96 |   # Helpers.
97 | 
98 |   def _read(self, hdfs_path, encoding=None):
99 |     with self.client.read(hdfs_path, encoding=encoding) as reader:
100 |       return reader.read()
101 | 
102 |   def _write(self, hdfs_path, data, encoding=None):
103 |     with self.client.write(hdfs_path, encoding=encoding) as writer:
104 |       return writer.write(data)
105 | 
106 |   def _exists(self, hdfs_path):
107 |     return bool(self.client.status(hdfs_path, strict=False))
108 | 
--------------------------------------------------------------------------------
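
A hypothetical usage sketch (not part of the repository): it shows how a new integration test module could build on the `_IntegrationTest` base class above, reusing its sandboxed client and the `_read`/`_write`/`_exists` helpers. The module name, class name, and test name are illustrative assumptions; the import relies on the `test` package layout shown in the tree.

.. code:: python

   from test.util import _IntegrationTest


   class TestRoundTrip(_IntegrationTest):

     def test_write_then_read(self):
       # Paths are resolved against the sandboxed root configured in
       # `setup_class` (via `HDFSCLI_TEST_ALIAS` or `HDFSCLI_TEST_URL`);
       # `setup_method` skips the test when neither variable is defined.
       self._write('greeting', b'hello')
       assert self._exists('greeting')
       assert self._read('greeting') == b'hello'

As in the CI workflow, such a test would be run against a live cluster with something like `HDFSCLI_TEST_URL="$WEBHDFS_URL" python -m pytest`.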