├── requirements.txt ├── fsspec ├── tests │ ├── __init__.py │ ├── test_gui.py │ ├── test_async.py │ ├── test_fuse.py │ ├── test_config.py │ ├── test_caches.py │ ├── test_mapping.py │ ├── test_registry.py │ ├── test_compression.py │ ├── test_core.py │ └── test_file.py ├── implementations │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ ├── test_dask.py │ │ ├── conftest.py │ │ ├── test_common.py │ │ ├── test_hdfs.py │ │ ├── test_jupyter.py │ │ ├── test_git.py │ │ ├── cassettes │ │ │ └── test_dbfs_file_listing.yaml │ │ ├── test_reference.py │ │ ├── test_memory.py │ │ ├── test_smb.py │ │ ├── test_webhdfs.py │ │ ├── test_sftp.py │ │ ├── test_ftp.py │ │ ├── test_dbfs.py │ │ ├── test_zip.py │ │ └── test_libarchive.py │ ├── dvc.py │ ├── git.py │ ├── jupyter.py │ ├── sftp.py │ ├── dask.py │ ├── zip.py │ ├── reference.py │ ├── hdfs.py │ ├── libarchive.py │ └── github.py ├── __init__.py ├── conftest.py ├── transaction.py ├── dircache.py ├── config.py ├── compression.py ├── fuse.py └── mapping.py ├── .gitattributes ├── readthedocs.yml ├── docs ├── source │ ├── img │ │ └── gui.png │ ├── index.rst │ ├── developer.rst │ ├── usage.rst │ ├── intro.rst │ ├── changelog.rst │ ├── conf.py │ └── api.rst ├── environment.yml ├── README.md ├── Makefile └── make.bat ├── MANIFEST.in ├── ci ├── environment-py38.yml └── environment-win.yml ├── pyproject.toml ├── .coveragerc ├── .pre-commit-config.yaml ├── setup.cfg ├── .github └── workflows │ ├── pypipublish.yaml │ └── main.yaml ├── LICENSE ├── setup.py ├── .gitignore ├── tox.ini └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fsspec/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fsspec/implementations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | fsspec/_version.py export-subst 2 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | conda: 2 | file: docs/environment.yml 3 | -------------------------------------------------------------------------------- /docs/source/img/gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/filesystem_spec/master/docs/source/img/gui.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include fsspec/_version.py 3 | 4 | include LICENSE 5 | include README.rst 6 | include requirements.txt 7 | -------------------------------------------------------------------------------- /ci/environment-py38.yml: -------------------------------------------------------------------------------- 1 | name: test_env 2 | channels: 3 | - conda-forge 4 | - defaults 5 | 
dependencies:
 6 | - python=3.8
 7 | - tox
 8 | - tox-conda
 9 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.black]
 2 | # Revert to py34 target syntax to accommodate
 3 | # errors in trailing commas.
 4 | # https://github.com/psf/black/pull/763
 5 | target_version = ['py34']
 6 | 
--------------------------------------------------------------------------------
/fsspec/tests/test_gui.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | panel = pytest.importorskip("panel")
 4 | 
 5 | 
 6 | def test_basic():
 7 |     import fsspec.gui
 8 | 
 9 |     gui = fsspec.gui.FileSelector()
10 |     assert "url" in str(gui.panel)
11 | 
--------------------------------------------------------------------------------
/docs/environment.yml:
--------------------------------------------------------------------------------
 1 | name: fsspec
 2 | channels:
 3 | - defaults
 4 | - conda-forge
 5 | dependencies:
 6 | - python=3.7
 7 | - paramiko
 8 | - requests
 9 | - numpydoc
10 | - aiohttp
11 | - pygit2
12 | - distributed
13 | - pyarrow=1
14 | - libarchive
15 | - smbprotocol
16 | 
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # Building Documentation
 2 | 
 3 | A basic Python environment with the packages listed in `environment.yml` is
 4 | required to build the docs.
 5 | 
 6 | To make HTML documentation:
 7 | 
 8 | ```bash
 9 | make html
10 | ```
11 | 
12 | Outputs to `build/html/index.html`
13 | 
--------------------------------------------------------------------------------
/ci/environment-win.yml:
--------------------------------------------------------------------------------
 1 | name: test_env
 2 | channels:
 3 | - conda-forge
 4 | - defaults
 5 | dependencies:
 6 | - aiohttp
 7 | - pip
 8 | - requests
 9 | - zstandard
10 | - python-snappy
11 | - lz4
12 | - pyftpdlib
13 | - cloudpickle
14 | - pytest
15 | - pytest-benchmark
16 | - pytest-cov
17 | - pytest-vcr
18 | - python-libarchive-c
19 | - numpy
20 | - nomkl
21 | 
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | [run]
 2 | omit =
 3 |     */test_*.py
 4 |     fsspec/_version.py
 5 |     fsspec/implementations/dvc.py
 6 |     fsspec/implementations/github.py
 7 |     fsspec/implementations/hdfs.py
 8 | source =
 9 |     fsspec
10 | 
11 | [report]
12 | # Regexes for lines to exclude from consideration
13 | exclude_lines =
14 |     pragma: no cover
15 | 
16 |     raise AssertionError
17 |     raise NotImplementedError
18 |     pass
19 | 
20 | ignore_errors = True
21 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | exclude: >
 2 |   (?x)^(
 3 |     \.tox/.*
 4 |   )$
 5 | default_language_version:
 6 |   python: python3.7
 7 | repos:
 8 |   - repo: local
 9 |     hooks:
10 |       - id: black
11 |         name: black
12 |         entry: black
13 |         language: python
14 |         require_serial: true
15 |         types: [python]
16 |   - repo: https://github.com/pre-commit/pre-commit-hooks
17 |     rev: v2.3.0
18 |     hooks:
19 |       - id: flake8
20 | 
--------------------------------------------------------------------------------
/setup.cfg:
-------------------------------------------------------------------------------- 1 | [metadata] 2 | long_description: file: README.rst 3 | 4 | [versioneer] 5 | VCS = git 6 | style = pep440 7 | versionfile_source = fsspec/_version.py 8 | versionfile_build = fsspec/_version.py 9 | tag_prefix = "" 10 | 11 | [flake8] 12 | exclude = .tox,build,docs/source/conf.py,versioneer.py 13 | max-line-length = 88 14 | ignore = 15 | # Assigning lambda expression 16 | E731 17 | # Ambiguous variable names 18 | E741 19 | # line break before binary operator 20 | W503 21 | # whitespace before : 22 | E203 23 | -------------------------------------------------------------------------------- /fsspec/tests/test_async.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import asyncio 3 | import sys 4 | from fsspec.asyn import _run_until_done 5 | 6 | 7 | async def inner(): 8 | await asyncio.sleep(1) 9 | return True 10 | 11 | 12 | async def outer(): 13 | await asyncio.sleep(1) 14 | return _run_until_done(inner()) 15 | 16 | 17 | @pytest.mark.skipif(sys.version_info < (3, 7), reason="Async fails on py36") 18 | def test_runtildone(): 19 | loop = asyncio.new_event_loop() 20 | asyncio.set_event_loop(loop) 21 | assert loop.run_until_complete(outer()) 22 | loop.close() 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = fsspec 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
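# For example, "make html O=-W" forwards -W (treat warnings as errors) to sphinx-build.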
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_dask.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import fsspec 3 | 4 | pytest.importorskip("distributed") 5 | 6 | 7 | @pytest.fixture() 8 | def cli(tmpdir): 9 | import dask.distributed 10 | 11 | client = dask.distributed.Client(n_workers=1) 12 | 13 | def setup(): 14 | m = fsspec.filesystem("memory") 15 | with m.open("afile", "wb") as f: 16 | f.write(b"data") 17 | 18 | client.run(setup) 19 | try: 20 | yield client 21 | finally: 22 | client.close() 23 | 24 | 25 | def test_basic(cli): 26 | 27 | fs = fsspec.filesystem("dask", target_protocol="memory") 28 | assert fs.ls("") == ["afile"] 29 | assert fs.cat("afile") == b"data" 30 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import pytest 4 | 5 | from fsspec.implementations.local import LocalFileSystem 6 | 7 | 8 | # A dummy filesystem that has a list of protocols 9 | class MultiProtocolFileSystem(LocalFileSystem): 10 | protocol = ["file", "other"] 11 | 12 | 13 | FILESYSTEMS = {"local": LocalFileSystem, "multi": MultiProtocolFileSystem} 14 | 15 | READ_ONLY_FILESYSTEMS = [] 16 | 17 | 18 | @pytest.fixture(scope="function") 19 | def fs(request): 20 | cls = FILESYSTEMS[request.param] 21 | return cls() 22 | 23 | 24 | @pytest.fixture(scope="function") 25 | def temp_file(): 26 | with tempfile.TemporaryDirectory() as temp_dir: 27 | return temp_dir + "test-file" 28 | -------------------------------------------------------------------------------- /.github/workflows/pypipublish.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: "3.x" 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools setuptools-scm wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=fsspec 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_common.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import pytest 4 | 5 | from fsspec import AbstractFileSystem 6 | from fsspec.implementations.tests.conftest import READ_ONLY_FILESYSTEMS 7 | 8 | 9 | @pytest.mark.parametrize("fs", ["local"], indirect=["fs"]) 10 | def test_created(fs: AbstractFileSystem, temp_file): 11 | try: 12 | fs.touch(temp_file) 13 | created = fs.created(path=temp_file) 14 | assert isinstance(created, datetime.datetime) 15 | finally: 16 | if not isinstance(fs, tuple(READ_ONLY_FILESYSTEMS)): 17 | fs.rm(temp_file) 18 | 19 | 20 | @pytest.mark.parametrize("fs", ["local"], indirect=["fs"]) 21 | def test_modified(fs: AbstractFileSystem, temp_file): 22 | try: 23 | fs.touch(temp_file) 24 | created = fs.created(path=temp_file) 25 | time.sleep(0.05) 26 | fs.touch(temp_file) 27 | modified = fs.modified(path=temp_file) 28 | assert isinstance(modified, datetime.datetime) 29 | assert modified > created 30 | finally: 31 | fs.rm(temp_file) 32 | -------------------------------------------------------------------------------- /fsspec/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from importlib.metadata import entry_points 3 | except ImportError: # python < 3.8 4 | try: 5 | from importlib_metadata import entry_points 6 | except ImportError: 7 | entry_points = None 8 | 9 | 10 | from . import caching 11 | from ._version import get_versions 12 | from .core import get_fs_token_paths, open, open_files, open_local 13 | from .mapping import FSMap, get_mapper 14 | from .registry import ( 15 | filesystem, 16 | get_filesystem_class, 17 | register_implementation, 18 | registry, 19 | ) 20 | from .spec import AbstractFileSystem 21 | 22 | __version__ = get_versions()["version"] 23 | del get_versions 24 | 25 | 26 | __all__ = [ 27 | "AbstractFileSystem", 28 | "FSMap", 29 | "filesystem", 30 | "register_implementation", 31 | "get_filesystem_class", 32 | "get_fs_token_paths", 33 | "get_mapper", 34 | "open", 35 | "open_files", 36 | "open_local", 37 | "registry", 38 | "caching", 39 | ] 40 | 41 | if entry_points is not None: 42 | entry_points = entry_points() 43 | for spec in entry_points.get("fsspec.specs", []): 44 | err_msg = f"Unable to load filesystem from {spec}" 45 | register_implementation(spec.name, spec.module, errtxt=err_msg) 46 | -------------------------------------------------------------------------------- /fsspec/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import sys 5 | import time 6 | 7 | import pytest 8 | 9 | import fsspec 10 | from fsspec.implementations.cached import CachingFileSystem 11 | 12 | 13 | @pytest.fixture() 14 | def m(): 15 | """ 16 | Fixture providing a memory filesystem. 17 | """ 18 | m = fsspec.filesystem("memory") 19 | m.store.clear() 20 | try: 21 | yield m 22 | finally: 23 | m.store.clear() 24 | 25 | 26 | @pytest.fixture 27 | def ftp_writable(tmpdir): 28 | """ 29 | Fixture providing a writable FTP filesystem. 
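    Starts a local pyftpdlib FTP server in a subprocess and yields the
    connection details ``(host, port, username, password)``, here
    ``("localhost", 2121, "user", "pass")``, once the server is ready.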
30 | """ 31 | pytest.importorskip("pyftpdlib") 32 | from fsspec.implementations.ftp import FTPFileSystem 33 | 34 | FTPFileSystem.clear_instance_cache() # remove lingering connections 35 | CachingFileSystem.clear_instance_cache() 36 | d = str(tmpdir) 37 | with open(os.path.join(d, "out"), "wb") as f: 38 | f.write(b"hello" * 10000) 39 | P = subprocess.Popen( 40 | [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"] 41 | ) 42 | try: 43 | time.sleep(1) 44 | yield "localhost", 2121, "user", "pass" 45 | finally: 46 | P.terminate() 47 | P.wait() 48 | try: 49 | shutil.rmtree(tmpdir) 50 | except Exception: 51 | pass 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Martin Durant 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_hdfs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import fsspec 3 | 4 | pyarrow = pytest.importorskip("pyarrow") 5 | 6 | basedir = "/tmp/test-fsspec" 7 | data = b"\n".join([b"some test data"] * 1000) 8 | 9 | 10 | @pytest.fixture 11 | def hdfs(request): 12 | try: 13 | hdfs = pyarrow.hdfs.connect() 14 | except IOError: 15 | pytest.skip("No HDFS configured") 16 | 17 | if hdfs.exists(basedir): 18 | hdfs.rm(basedir, recursive=True) 19 | 20 | hdfs.mkdir(basedir) 21 | 22 | with hdfs.open(basedir + "/file", "wb") as f: 23 | f.write(data) 24 | 25 | yield hdfs 26 | 27 | if hdfs.exists(basedir): 28 | hdfs.rm(basedir, recursive=True) 29 | 30 | 31 | def test_ls(hdfs): 32 | h = fsspec.filesystem("hdfs") 33 | out = [f["name"] for f in h.ls(basedir)] 34 | assert out == hdfs.ls(basedir) 35 | 36 | 37 | def test_walk(hdfs): 38 | h = fsspec.filesystem("hdfs") 39 | out = h.walk(basedir) 40 | assert list(out) == list(hdfs.walk(basedir)) 41 | 42 | 43 | def test_isdir(hdfs): 44 | h = fsspec.filesystem("hdfs") 45 | assert h.isdir(basedir) 46 | assert not h.isdir(basedir + "/file") 47 | 48 | 49 | def test_exists(hdfs): 50 | h = fsspec.filesystem("hdfs") 51 | assert not h.exists(basedir + "/notafile") 52 | 53 | 54 | def test_read(hdfs): 55 | h = fsspec.filesystem("hdfs") 56 | out = basedir + "/file" 57 | with h.open(out, "rb") as f: 58 | assert f.read() == data 59 | with h.open(out, "rb", block_size=0) as f: 60 | assert f.read() == data 61 | with h.open(out, "rb") as f: 62 | assert f.read(100) + f.read() == data 63 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_jupyter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import subprocess 4 | import time 5 | import pytest 6 | 7 | import fsspec 8 | 9 | pytest.importorskip("notebook") 10 | requests = pytest.importorskip("requests") 11 | 12 | 13 | @pytest.fixture() 14 | def jupyter(tmpdir): 15 | 16 | tmpdir = str(tmpdir) 17 | os.environ["JUPYTER_TOKEN"] = "blah" 18 | try: 19 | cmd = f"jupyter notebook --notebook-dir={tmpdir} --no-browser --port=5566" 20 | P = subprocess.Popen(shlex.split(cmd)) 21 | except FileNotFoundError: 22 | pytest.skip("notebook not installed correctly") 23 | try: 24 | timeout = 15 25 | while True: 26 | try: 27 | r = requests.get("http://localhost:5566/?token=blah") 28 | r.raise_for_status() 29 | break 30 | except (requests.exceptions.BaseHTTPError, IOError): 31 | time.sleep(0.1) 32 | timeout -= 0.1 33 | if timeout < 0: 34 | pytest.xfail("Timed out for jupyter") 35 | yield "http://localhost:5566/?token=blah", tmpdir 36 | finally: 37 | P.terminate() 38 | 39 | 40 | def test_simple(jupyter): 41 | url, d = jupyter 42 | fs = fsspec.filesystem("jupyter", url=url) 43 | assert fs.ls("") == [] 44 | 45 | fs.pipe("afile", b"data") 46 | assert fs.cat("afile") == b"data" 47 | assert "afile" in os.listdir(d) 48 | 49 | with fs.open("bfile", "wb") as f: 50 | f.write(b"more") 51 | with fs.open("bfile", "rb") as f: 52 | assert f.read() == b"more" 53 | 54 | assert fs.info("bfile")["size"] == 4 55 | fs.rm("afile") 56 | 57 | assert "afile" not in os.listdir(d) 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | import os 3 | 4 | from setuptools import setup 5 | import versioneer 6 | 7 | here = os.path.abspath(os.path.dirname(__file__)) 8 | with open(os.path.join(here, "README.md"), encoding="utf-8") as f: 9 | long_description = f.read() 10 | 11 | setup( 12 | name="fsspec", 13 | version=versioneer.get_version(), 14 | cmdclass=versioneer.get_cmdclass(), 15 | classifiers=[ 16 | "Development Status :: 4 - Beta", 17 | "Intended Audience :: Developers", 18 | "License :: OSI Approved :: BSD License", 19 | "Operating System :: OS Independent", 20 | "Programming Language :: Python :: 3.6", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | ], 24 | description="File-system specification", 25 | long_description=long_description, 26 | long_description_content_type="text/markdown", 27 | url="http://github.com/intake/filesystem_spec", 28 | maintainer="Martin Durant", 29 | maintainer_email="mdurant@anaconda.com", 30 | license="BSD", 31 | keywords="file", 32 | packages=["fsspec", "fsspec.implementations"], 33 | python_requires=">3.6", 34 | install_requires=open("requirements.txt").read().strip().split("\n"), 35 | extras_require={ 36 | ":python_version < '3.8'": ['importlib_metadata'], 37 | "abfs": ["adlfs"], 38 | "adl": ["adlfs"], 39 | "dask": ["dask", "distributed"], 40 | "dropbox": ["dropboxdrivefs", "requests", "dropbox"], 41 | "gcs": ["gcsfs"], 42 | "git": ["pygit2"], 43 | "github": ["requests"], 44 | "gs": ["gcsfs"], 45 | "hdfs": ["pyarrow"], 46 | "http": ["requests", "aiohttp"], 47 | "sftp": ["paramiko"], 48 | "s3": ["s3fs"], 49 | "smb": ["smbprotocol"], 50 | "ssh": ["paramiko"], 51 | }, 52 | zip_safe=False, 53 | ) 54 | -------------------------------------------------------------------------------- /fsspec/tests/test_fuse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import time 4 | from multiprocessing import Process 5 | 6 | import pytest 7 | 8 | try: 9 | pytest.importorskip("fuse") # noqa: E402 10 | except OSError: 11 | # can succeed in importing fuse, but fail to load so 12 | pytest.importorskip("nonexistent") # noqa: E402 13 | 14 | from fsspec.fuse import run 15 | from fsspec.implementations.memory import MemoryFileSystem 16 | 17 | 18 | def host_fuse(mountdir): 19 | fs = MemoryFileSystem() 20 | fs.touch("/mounted/testfile") 21 | run(fs, "/mounted/", mountdir) 22 | 23 | 24 | def test_basic(tmpdir, capfd): 25 | mountdir = str(tmpdir.mkdir("mount")) 26 | 27 | fuse_process = Process(target=host_fuse, args=(str(mountdir),)) 28 | fuse_process.start() 29 | 30 | try: 31 | timeout = 10 32 | while True: 33 | try: 34 | # can fail with device not ready while waiting for fuse 35 | if "testfile" in os.listdir(mountdir): 36 | break 37 | except Exception: 38 | pass 39 | timeout -= 1 40 | time.sleep(1) 41 | assert timeout > 0, "Timeout" 42 | 43 | fn = os.path.join(mountdir, "test") 44 | with open(fn, "wb") as f: 45 | f.write(b"data") 46 | 47 | with open(fn) as f: 48 | assert f.read() == "data" 49 | 50 | os.remove(fn) 51 | 52 | os.mkdir(fn) 53 | assert os.listdir(fn) == [] 54 | 55 | os.mkdir(fn + "/inner") 56 | 57 | with pytest.raises(OSError): 58 | os.rmdir(fn) 59 | 60 | captured = capfd.readouterr() 61 | assert "Traceback" not in captured.out 62 | assert "Traceback" not in captured.err 63 | 64 | os.rmdir(fn + "/inner") 65 | os.rmdir(fn) 66 | finally: 67 | os.kill(fuse_process.pid, signal.SIGTERM) 68 | fuse_process.join() 69 | 
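# A minimal sketch of the same mounting pattern outside pytest; the mount
# point path is illustrative, and run() blocks until the filesystem is
# unmounted, hence the separate process used in the fixture above:
#
#     from fsspec.fuse import run
#     from fsspec.implementations.memory import MemoryFileSystem
#
#     fs = MemoryFileSystem()
#     fs.touch("/mounted/testfile")
#     run(fs, "/mounted/", "/tmp/fuse-mnt")  # mount point must already exist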
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dask 2 | dask-worker-space 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | pip-wheel-metadata/ 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # jetbrains ide stuff 109 | *.iml 110 | .idea/ 111 | 112 | # vscode ide stuff 113 | *.code-workspace 114 | .history 115 | .vscode 116 | -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: "*" 6 | pull_request: 7 | branches: master 8 | 9 | jobs: 10 | linux: 11 | name: ${{ matrix.TOXENV }}-pytest 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | TOXENV: [py37, py38, s3fs, gcsfs] 17 | 18 | env: 19 | TOXENV: ${{ matrix.TOXENV }} 20 | CIRUN: true 21 | 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v2 25 | 26 | - name: Setup Miniconda 27 | uses: conda-incubator/setup-miniconda@v2 28 | with: 29 | auto-update-conda: true 30 | auto-activate-base: false 31 | activate-environment: test_env 32 | environment-file: ci/environment-py38.yml 33 | 34 | - name: Run Tests 35 | shell: bash -l {0} 36 | run: | 37 | tox -v 38 | 39 | win: 40 | name: ${{ matrix.TOXENV }}-pytest-win 41 | runs-on: windows-2019 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | TOXENV: [py38] 46 | 47 | env: 48 | TOXENV: ${{ matrix.TOXENV }} 49 | CIRUN: true 50 | 51 | steps: 52 | - name: Checkout 53 | uses: actions/checkout@v2 54 | 55 | - name: Setup Miniconda 56 | uses: conda-incubator/setup-miniconda@v2 57 | with: 58 | auto-update-conda: true 59 | auto-activate-base: false 60 | activate-environment: test_env 61 | environment-file: ci/environment-win.yml 62 | 63 | - name: Run Tests 64 | shell: bash -l {0} 65 | run: | 66 | 
pytest -v 67 | 68 | lint: 69 | name: lint 70 | runs-on: ubuntu-latest 71 | steps: 72 | - uses: actions/checkout@v2 73 | - uses: actions/setup-python@v2 74 | - name: pip-install 75 | shell: bash -l {0} 76 | run: | 77 | pip install flake8 black 78 | - name: Lint 79 | shell: bash -l {0} 80 | run: | 81 | flake8 fsspec 82 | black fsspec --check 83 | -------------------------------------------------------------------------------- /fsspec/tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import fsspec 4 | from fsspec.config import conf, set_conf_env, set_conf_files 5 | 6 | 7 | @pytest.fixture 8 | def clean_conf(): 9 | """Tests should start and end with clean config dict""" 10 | conf.clear() 11 | yield 12 | conf.clear() 13 | 14 | 15 | def test_from_env(clean_conf): 16 | env = { 17 | "FSSPEC_PROTO_KEY": "value", 18 | "FSSPEC_PROTO_LONG_KEY": "othervalue", 19 | "FSSPEC_MALFORMED": "novalue", 20 | } 21 | cd = {} 22 | set_conf_env(conf_dict=cd, envdict=env) 23 | assert cd == {"proto": {"key": "value", "long_key": "othervalue"}} 24 | 25 | 26 | def test_from_file_ini(clean_conf, tmpdir): 27 | file1 = os.path.join(tmpdir, "1.ini") 28 | file2 = os.path.join(tmpdir, "2.ini") 29 | with open(file1, "w") as f: 30 | f.write( 31 | """[proto] 32 | key=value 33 | other_key:othervalue 34 | overwritten=dont_see 35 | """ 36 | ) 37 | with open(file2, "w") as f: 38 | f.write( 39 | """[proto] 40 | overwritten=see 41 | """ 42 | ) 43 | cd = {} 44 | set_conf_files(tmpdir, cd) 45 | assert cd == { 46 | "proto": {"key": "value", "other_key": "othervalue", "overwritten": "see"} 47 | } 48 | 49 | 50 | def test_from_file_json(clean_conf, tmpdir): 51 | file1 = os.path.join(tmpdir, "1.json") 52 | file2 = os.path.join(tmpdir, "2.json") 53 | with open(file1, "w") as f: 54 | f.write( 55 | """{"proto": 56 | {"key": "value", 57 | "other_key": "othervalue", 58 | "overwritten": false}} 59 | """ 60 | ) 61 | with open(file2, "w") as f: 62 | f.write( 63 | """{"proto": 64 | {"overwritten": true}} 65 | """ 66 | ) 67 | cd = {} 68 | set_conf_files(tmpdir, cd) 69 | assert cd == { 70 | "proto": {"key": "value", "other_key": "othervalue", "overwritten": True} 71 | } 72 | 73 | 74 | def test_apply(clean_conf): 75 | conf["file"] = {"auto_mkdir": "test"} 76 | fs = fsspec.filesystem("file") 77 | assert fs.auto_mkdir == "test" 78 | fs = fsspec.filesystem("file", auto_mkdir=True) 79 | assert fs.auto_mkdir is True 80 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # content of: tox.ini , put in same dir as setup.py 2 | [tox] 3 | envlist = {py36,py37,py38} 4 | 5 | [core] 6 | conda_channels= 7 | defaults 8 | conda-forge 9 | conda_deps= 10 | pip 11 | paramiko 12 | requests 13 | zstandard 14 | python-snappy 15 | aiohttp 16 | lz4 17 | distributed 18 | dask 19 | pyarrow 20 | panel 21 | notebook 22 | pygit2 23 | git 24 | s3fs 25 | pyftpdlib 26 | cloudpickle 27 | pytest 28 | pytest-benchmark 29 | pytest-cov 30 | pytest-vcr 31 | fusepy 32 | msgpack-python 33 | python-libarchive-c 34 | numpy 35 | nomkl 36 | deps= 37 | hadoop-test-cluster==0.1.0 38 | smbprotocol 39 | py36,py37: importlib_metadata 40 | 41 | [testenv] 42 | description=Run test suite against target versions. 
43 | conda_channels= 44 | {[core]conda_channels} 45 | conda_deps= 46 | {[core]conda_deps} 47 | deps= 48 | {[core]deps} 49 | commands = 50 | py.test --cov=fsspec -v -r s {posargs} 51 | passenv = CIRUN 52 | 53 | [testenv:s3fs] 54 | description=Run s3fs (@master) test suite against fsspec. 55 | extras=s3 56 | conda_channels= 57 | defaults 58 | conda-forge 59 | conda_deps= 60 | {[core]conda_deps} 61 | httpretty 62 | aiobotocore 63 | moto 64 | flask 65 | changedir=.tox/s3fs/tmp 66 | whitelist_externals= 67 | rm 68 | git 69 | setenv= 70 | BOTO_CONFIG=/dev/null 71 | AWS_ACCESS_KEY_ID=foobar_key 72 | AWS_SECRET_ACCESS_KEY=foobar_secret 73 | commands= 74 | rm -rf s3fs 75 | git clone https://github.com/dask/s3fs 76 | py.test -vv s3fs/s3fs 77 | 78 | [testenv:gcsfs] 79 | description=Run gcsfs (@master) test suite against fsspec. 80 | extras=gcs 81 | conda_channels= 82 | defaults 83 | conda-forge 84 | conda_deps= 85 | {[core]conda_deps} 86 | deps= 87 | {[core]deps} 88 | vcrpy 89 | google-auth-oauthlib 90 | changedir=.tox/gcsfs/tmp 91 | whitelist_externals= 92 | rm 93 | git 94 | setenv= 95 | GCSFS_RECORD_MODE=none 96 | commands= 97 | rm -rf gcsfs 98 | git clone https://github.com/dask/gcsfs 99 | py.test -vv gcsfs/gcsfs -k 'not fuse' 100 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_git.py: -------------------------------------------------------------------------------- 1 | import fsspec 2 | import os 3 | import pytest 4 | import shutil 5 | import tempfile 6 | import subprocess 7 | 8 | pygit2 = pytest.importorskip("pygit2") 9 | 10 | 11 | @pytest.fixture() 12 | def repo(): 13 | orig_dir = os.getcwd() 14 | d = tempfile.mkdtemp() 15 | try: 16 | os.chdir(d) 17 | subprocess.call("git init", shell=True, cwd=d) 18 | subprocess.call("git init", shell=True, cwd=d) 19 | subprocess.call('git config user.email "you@example.com"', shell=True, cwd=d) 20 | subprocess.call('git config user.name "Your Name"', shell=True, cwd=d) 21 | open(os.path.join(d, "file1"), "wb").write(b"data0") 22 | subprocess.call("git add file1", shell=True, cwd=d) 23 | subprocess.call('git commit -m "init"', shell=True, cwd=d) 24 | sha = open(os.path.join(d, ".git/refs/heads/master"), "r").read().strip() 25 | open(os.path.join(d, "file1"), "wb").write(b"data00") 26 | subprocess.check_output('git commit -a -m "tagger"', shell=True, cwd=d) 27 | subprocess.call('git tag -a thetag -m "make tag"', shell=True, cwd=d) 28 | open(os.path.join(d, "file2"), "wb").write(b"data000") 29 | subprocess.call("git add file2", shell=True) 30 | subprocess.call('git commit -m "master tip"', shell=True, cwd=d) 31 | subprocess.call("git checkout -b abranch", shell=True, cwd=d) 32 | os.mkdir("inner") 33 | open(os.path.join(d, "inner", "file1"), "wb").write(b"data3") 34 | subprocess.call("git add inner/file1", shell=True, cwd=d) 35 | subprocess.call('git commit -m "branch tip"', shell=True, cwd=d) 36 | yield d, sha 37 | finally: 38 | os.chdir(orig_dir) 39 | shutil.rmtree(d) 40 | 41 | 42 | def test_refs(repo): 43 | d, sha = repo 44 | with fsspec.open("git://file1", path=d, ref=sha) as f: 45 | assert f.read() == b"data0" 46 | 47 | with fsspec.open("git://file1", path=d, ref="thetag") as f: 48 | assert f.read() == b"data00" 49 | 50 | with fsspec.open("git://file2", path=d, ref="master") as f: 51 | assert f.read() == b"data000" 52 | 53 | with fsspec.open("git://file2", path=d, ref=None) as f: 54 | assert f.read() == b"data000" 55 | 56 | with fsspec.open("git://inner/file1", path=d, ref="abranch") as f: 
57 | assert f.read() == b"data3" 58 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/cassettes/test_dbfs_file_listing.yaml: -------------------------------------------------------------------------------- 1 | interactions: 2 | - request: 3 | body: '{"path": "/"}' 4 | headers: 5 | Accept: 6 | - '*/*' 7 | Accept-Encoding: 8 | - gzip, deflate 9 | Connection: 10 | - keep-alive 11 | Content-Length: 12 | - '13' 13 | Content-Type: 14 | - application/json 15 | User-Agent: 16 | - python-requests/2.25.1 17 | authorization: 18 | - DUMMY 19 | method: GET 20 | uri: https://my_instance.com/api/2.0/dbfs/list 21 | response: 22 | body: 23 | string: !!binary | 24 | H4sIAAAAAAAEA4zLMQ5AQBBG4bv89Qr1HsAFlCKyGDFByMxsQ9ydaHSy7cv3Toy8kMLXJ/ZgEzzy 25 | 8imVbUJwYG0HFniTSO61rfJB8MXlvmMIFjrhftZMSONimrwaqaXjdU+2UUn+cXPdAAAA//8DAHlY 26 | NJf+AAAA 27 | headers: 28 | content-encoding: 29 | - gzip 30 | content-type: 31 | - application/json 32 | server: 33 | - databricks 34 | strict-transport-security: 35 | - max-age=31536000; includeSubDomains; preload 36 | transfer-encoding: 37 | - chunked 38 | vary: 39 | - Accept-Encoding 40 | x-content-type-options: 41 | - nosniff 42 | status: 43 | code: 200 44 | message: OK 45 | - request: 46 | body: '{"path": "/"}' 47 | headers: 48 | Accept: 49 | - '*/*' 50 | Accept-Encoding: 51 | - gzip, deflate 52 | Connection: 53 | - keep-alive 54 | Content-Length: 55 | - '13' 56 | Content-Type: 57 | - application/json 58 | User-Agent: 59 | - python-requests/2.25.1 60 | authorization: 61 | - DUMMY 62 | method: GET 63 | uri: https://my_instance.com/api/2.0/dbfs/list 64 | response: 65 | body: 66 | string: !!binary | 67 | H4sIAAAAAAAEA4zLMQ5AQBBG4bv89Qr1HsAFlCKyGDFByMxsQ9ydaHSy7cv3Toy8kMLXJ/ZgEzzy 68 | 8imVbUJwYG0HFniTSO61rfJB8MXlvmMIFjrhftZMSONimrwaqaXjdU+2UUn+cXPdAAAA//8DAHlY 69 | NJf+AAAA 70 | headers: 71 | content-encoding: 72 | - gzip 73 | content-type: 74 | - application/json 75 | server: 76 | - databricks 77 | strict-transport-security: 78 | - max-age=31536000; includeSubDomains; preload 79 | transfer-encoding: 80 | - chunked 81 | vary: 82 | - Accept-Encoding 83 | x-content-type-options: 84 | - nosniff 85 | status: 86 | code: 200 87 | message: OK 88 | version: 1 89 | -------------------------------------------------------------------------------- /fsspec/implementations/dvc.py: -------------------------------------------------------------------------------- 1 | import os 2 | from fsspec.spec import AbstractFileSystem 3 | from fsspec.implementations.local import LocalFileSystem 4 | import dvc.repo 5 | 6 | lfs = LocalFileSystem() 7 | 8 | 9 | class DVCFileSystem(AbstractFileSystem): 10 | """DVC backend (experimental) 11 | 12 | Load data files that are versioned using the `Data Version Control`_ system 13 | 14 | .. _Data Version Control: https://dvc.org/ 15 | 16 | This interface is incomplete and experimental. 17 | """ 18 | 19 | root_marker = "" 20 | 21 | def __init__(self, path=None, **kwargs): 22 | """ 23 | 24 | Parameters 25 | ---------- 26 | path: str (optional) 27 | Location of the repo to access; defaults to the current directory. 
28 | """ 29 | super().__init__(**kwargs) 30 | self.repo = dvc.repo.Repo(path) 31 | self.path = self.repo.find_root() 32 | 33 | @classmethod 34 | def _strip_protocol(cls, path): 35 | return super()._strip_protocol(path).lstrip("/") 36 | 37 | def ls(self, path, detail=False, **kwargs): 38 | path = self._strip_protocol(path) 39 | allfiles = self.repo.tree.walk(os.path.join(self.repo.root_dir, path)) 40 | dirname, dirs, files = next(allfiles) 41 | out = [os.path.join(path, f) for f in dirs + files] 42 | details = [] 43 | 44 | for f in out: 45 | full = os.path.join(self.repo.root_dir, f) 46 | file_info = lfs.info(full) 47 | if lfs.isdir(full): 48 | details.append(file_info) 49 | else: 50 | try: 51 | extra = self.repo.find_out_by_relpath(full).dumpd() 52 | except dvc.exceptions.OutputNotFoundError: 53 | continue 54 | details.append(dict(**extra, **file_info)) 55 | details[-1]["name"] = f 56 | if detail: 57 | return details 58 | return [d["name"] for d in details] 59 | 60 | def ukey(self, path): 61 | return self.info(path)["md5"] 62 | 63 | def _open( 64 | self, 65 | path, 66 | mode="rb", 67 | block_size=None, 68 | autocommit=True, 69 | cache_options=None, 70 | **kwargs 71 | ): 72 | # returns a context file object (i.e., needs to be used with ``with`` 73 | path = self._strip_protocol(path) 74 | return self.repo.open_by_relpath(path) 75 | -------------------------------------------------------------------------------- /fsspec/transaction.py: -------------------------------------------------------------------------------- 1 | class Transaction(object): 2 | """Filesystem transaction write context 3 | 4 | Gathers files for deferred commit or discard, so that several write 5 | operations can be finalized semi-atomically. This works by having this 6 | instance as the ``.transaction`` attribute of the given filesystem 7 | """ 8 | 9 | def __init__(self, fs): 10 | """ 11 | Parameters 12 | ---------- 13 | fs: FileSystem instance 14 | """ 15 | self.fs = fs 16 | self.files = [] 17 | 18 | def __enter__(self): 19 | self.start() 20 | 21 | def __exit__(self, exc_type, exc_val, exc_tb): 22 | """End transaction and commit, if exit is not due to exception""" 23 | # only commit if there was no exception 24 | self.complete(commit=exc_type is None) 25 | self.fs._intrans = False 26 | self.fs._transaction = None 27 | 28 | def start(self): 29 | """Start a transaction on this FileSystem""" 30 | self.files = [] # clean up after previous failed completions 31 | self.fs._intrans = True 32 | 33 | def complete(self, commit=True): 34 | """Finish transaction: commit or discard all deferred files""" 35 | for f in self.files: 36 | if commit: 37 | f.commit() 38 | else: 39 | f.discard() 40 | self.files = [] 41 | self.fs._intrans = False 42 | 43 | 44 | class FileActor(object): 45 | def __init__(self): 46 | self.files = [] 47 | 48 | def commit(self): 49 | for f in self.files: 50 | f.commit() 51 | self.files.clear() 52 | 53 | def discard(self): 54 | for f in self.files: 55 | f.discard() 56 | self.files.clear() 57 | 58 | def append(self, f): 59 | self.files.append(f) 60 | 61 | 62 | class DaskTransaction(Transaction): 63 | def __init__(self, fs): 64 | """ 65 | Parameters 66 | ---------- 67 | fs: FileSystem instance 68 | """ 69 | import distributed 70 | 71 | super().__init__(fs) 72 | client = distributed.default_client() 73 | self.files = client.submit(FileActor, actor=True).result() 74 | 75 | def complete(self, commit=True): 76 | """Finish transaction: commit or discard all deferred files""" 77 | if commit: 78 | 
self.files.commit().result() 79 | else: 80 | self.files.discard().result() 81 | self.fs._intrans = False 82 | -------------------------------------------------------------------------------- /fsspec/tests/test_caches.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import string 3 | 4 | import pytest 5 | from fsspec.caching import BlockCache, caches 6 | 7 | 8 | def test_cache_getitem(Cache_imp): 9 | cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters)) 10 | assert cacher._fetch(0, 4) == b"abcd" 11 | assert cacher._fetch(None, 4) == b"abcd" 12 | assert cacher._fetch(2, 4) == b"cd" 13 | 14 | 15 | def test_block_cache_lru(): 16 | cache = BlockCache(4, letters_fetcher, len(string.ascii_letters), maxblocks=2) 17 | # miss 18 | cache._fetch(0, 2) 19 | assert cache.cache_info().misses == 1 20 | assert cache.cache_info().currsize == 1 21 | 22 | # hit 23 | cache._fetch(0, 2) 24 | assert cache.cache_info().misses == 1 25 | assert cache.cache_info().currsize == 1 26 | 27 | # miss 28 | cache._fetch(4, 6) 29 | assert cache.cache_info().misses == 2 30 | assert cache.cache_info().currsize == 2 31 | 32 | # miss & evict 33 | cache._fetch(12, 13) 34 | assert cache.cache_info().misses == 3 35 | assert cache.cache_info().currsize == 2 36 | 37 | 38 | def _fetcher(start, end): 39 | return b"0" * (end - start) 40 | 41 | 42 | def letters_fetcher(start, end): 43 | return string.ascii_letters[start:end].encode() 44 | 45 | 46 | @pytest.fixture(params=caches.values(), ids=list(caches.keys())) 47 | def Cache_imp(request): 48 | return request.param 49 | 50 | 51 | def test_cache_empty_file(Cache_imp): 52 | blocksize = 5 53 | size = 0 54 | cache = Cache_imp(blocksize, _fetcher, size) 55 | assert cache._fetch(0, 0) == b"" 56 | 57 | 58 | def test_cache_pickleable(Cache_imp): 59 | blocksize = 5 60 | size = 100 61 | cache = Cache_imp(blocksize, _fetcher, size) 62 | cache._fetch(0, 5) # fill in cache 63 | unpickled = pickle.loads(pickle.dumps(cache)) 64 | assert isinstance(unpickled, Cache_imp) 65 | assert unpickled.blocksize == blocksize 66 | assert unpickled.size == size 67 | assert unpickled._fetch(0, 10) == b"0" * 10 68 | 69 | 70 | @pytest.mark.parametrize( 71 | "size_requests", 72 | [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]], 73 | ) 74 | @pytest.mark.parametrize("blocksize", [1, 10, 52, 100]) 75 | def test_cache_basic(Cache_imp, blocksize, size_requests): 76 | cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters)) 77 | 78 | for start, end in size_requests: 79 | result = cache._fetch(start, end) 80 | expected = string.ascii_letters[start:end].encode() 81 | assert result == expected 82 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | FSSPEC: Filesystem interfaces for Python 2 | ====================================== 3 | 4 | Filesystem Spec (FSSPEC) is a project to unify various projects and classes to work with remote filesystems and 5 | file-system-like abstractions using a standard pythonic interface. 6 | 7 | 8 | .. 
_highlight:
 9 | 
10 | Highlights
11 | ----------
12 | 
13 | - based on s3fs and gcsfs
14 | - ``fsspec`` instances are serializable and can be passed between processes/machines
15 | - the ``OpenFiles`` file-like instances are also serializable
16 | - implementations provide random access, to enable only the part of a file required to be read; plus a template
17 |   to base other file-like classes on
18 | - file access can use transparent compression and text-mode
19 | - any file-system directory can be viewed as a key-value/mapping store
20 | - if installed, all file-system classes also subclass from ``pyarrow.filesystem.FileSystem``, so
21 |   can work with any arrow function expecting such an instance
22 | - writes can be transactional: stored in a temporary location and only moved to the final
23 |   destination when the transaction is committed
24 | - FUSE: mount any path from any backend to a point on your file-system
25 | - cached instances tokenised on the instance parameters
26 | 
27 | These are described further in the :doc:`features` section.
28 | 
29 | Installation
30 | ------------
31 | 
32 |     pip install fsspec
33 | 
34 | Not all included filesystems are usable by default without installing extra
35 | dependencies. For example, to be able to access data in S3::
36 | 
37 |     pip install fsspec[s3]
38 | 
39 | or
40 | 
41 |     conda install -c conda-forge fsspec
42 | 
43 | Implementations
44 | ---------------
45 | 
46 | This repo contains several file-system implementations, see :ref:`implementations`. However,
47 | the external projects ``s3fs`` and ``gcsfs`` depend on ``fsspec`` and share the same behaviours.
48 | ``Dask`` and ``Intake`` use ``fsspec`` internally for their IO needs.
49 | 
50 | The current list of known implementations can be found as follows
51 | 
52 | .. code-block:: python
53 | 
54 |     from fsspec.registry import known_implementations
55 |     known_implementations
56 | 
57 | These are only imported on request, which may fail if a required dependency is missing. The dictionary
58 | ``fsspec.registry`` contains all imported implementations, and can be mutated by user code, if necessary.
59 | 
60 | 
61 | .. toctree::
62 |    :maxdepth: 2
63 |    :caption: Contents:
64 | 
65 |    intro.rst
66 |    usage.rst
67 |    features.rst
68 |    api.rst
69 |    changelog.rst
70 |    developer.rst
71 | 
72 | 
73 | Indices and tables
74 | ==================
75 | 
76 | * :ref:`genindex`
77 | * :ref:`modindex`
78 | * :ref:`search`
79 | 
--------------------------------------------------------------------------------
/fsspec/implementations/tests/test_reference.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import pytest
 3 | 
 4 | import fsspec
 5 | from .test_http import data, realfile, server  # noqa: F401
 6 | from fsspec.implementations.reference import _unmodel_hdf5
 7 | 
 8 | 
 9 | def test_simple(server):  # noqa: F811
10 | 
11 |     refs = {"a": b"data", "b": (realfile, 0, 5), "c": (realfile, 1, 5)}
12 |     h = fsspec.filesystem("http")
13 |     fs = fsspec.filesystem("reference", references=refs, fs=h)
14 | 
15 |     assert fs.cat("a") == b"data"
16 |     assert fs.cat("b") == data[:5]
17 |     assert fs.cat("c") == data[1 : 1 + 5]
18 | 
19 | 
20 | def test_ls(server):  # noqa: F811
21 |     refs = {"a": b"data", "b": (realfile, 0, 5), "c/d": (realfile, 1, 6)}
22 |     h = fsspec.filesystem("http")
23 |     fs = fsspec.filesystem("reference", references=refs, fs=h)
24 | 
25 |     assert fs.ls("", detail=False) == ["a", "b", "c"]
26 |     assert {"name": "c", "type": "directory", "size": 0} in fs.ls("", detail=True)
27 |     assert fs.find("") == ["a", "b", "c/d"]
28 |     assert fs.find("", withdirs=True) == ["a", "b", "c", "c/d"]
29 | 
30 | 
31 | def test_err(m):
32 |     with pytest.raises(NotImplementedError):
33 |         fsspec.filesystem("reference", references={}, fs=m)
34 |     with pytest.raises(NotImplementedError):
35 |         fsspec.filesystem("reference", references={}, target_protocol="memory")
36 | 
37 | 
38 | def test_defaults(server):  # noqa: F811
39 |     refs = {"a": b"data", "b": (None, 0, 5)}
40 |     fs = fsspec.filesystem(
41 |         "reference", references=refs, target_protocol="http", target=realfile
42 |     )
43 | 
44 |     assert fs.cat("a") == b"data"
45 |     assert fs.cat("b") == data[:5]
46 | 
47 | 
48 | jdata = """{
49 |     "metadata": {
50 |         ".zattrs": {
51 |             "Conventions": "UGRID-0.9.0"
52 |         },
53 |         ".zgroup": {
54 |             "zarr_format": 2
55 |         },
56 |         "adcirc_mesh/.zarray": {
57 |             "chunks": [
58 |                 1
59 |             ],
60 |             "dtype": "
--------------------------------------------------------------------------------
/fsspec/config.py:
--------------------------------------------------------------------------------
 1 | import configparser
 2 | import json
 3 | import os
 4 | 
 5 | conf = {}
 6 | default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
 7 | conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
 8 | 
 9 | 
10 | def set_conf_env(conf_dict, envdict=os.environ):
11 |     """Set config values from environment variables
12 | 
13 |     Looks for variables of the form ``FSSPEC_<protocol>`` and
14 |     ``FSSPEC_<protocol>_<kwarg>``.
15 |     There is no attempt to convert strings, but the kwarg keys will
16 |     be lower-cased.
17 | 
18 |     Parameters
19 |     ----------
20 |     conf_dict : dict(str, dict)
21 |         This dict will be mutated
22 |     envdict : dict-like(str, str)
23 |         Source for the values - usually the real environment
24 |     """
25 |     for key in envdict:
26 |         if key.startswith("FSSPEC"):
27 |             if key.count("_") < 2:
28 |                 continue
29 |             _, proto, kwarg = key.split("_", 2)
30 |             conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key]
31 | 
32 | 
33 | def set_conf_files(cdir, conf_dict):
34 |     """Set config values from files
35 | 
36 |     Scans for INI and JSON files in the given directory, and uses their
37 |     contents to set the config. In case of repeated values, later values
38 |     win.
39 | 
40 |     In the case of INI files, all values are strings, and these will not
41 |     be converted.
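    For example (mirroring the test suite), an INI file with a section
    ``[proto]`` and an entry ``key=value`` produces the config entry
    ``{"proto": {"key": "value"}}``.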
42 | 
43 |     Parameters
44 |     ----------
45 |     cdir : str
46 |         Directory to search
47 |     conf_dict : dict(str, dict)
48 |         This dict will be mutated
49 |     """
50 |     if not os.path.isdir(cdir):
51 |         return
52 |     allfiles = sorted(os.listdir(cdir))
53 |     for fn in allfiles:
54 |         if fn.endswith(".ini"):
55 |             ini = configparser.ConfigParser()
56 |             ini.read(os.path.join(cdir, fn))
57 |             for key in ini:
58 |                 if key == "DEFAULT":
59 |                     continue
60 |                 conf_dict.setdefault(key, {}).update(dict(ini[key]))
61 |         if fn.endswith(".json"):
62 |             js = json.load(open(os.path.join(cdir, fn)))
63 |             for key in js:
64 |                 conf_dict.setdefault(key, {}).update(dict(js[key]))
65 | 
66 | 
67 | def apply_config(cls, kwargs, conf_dict=conf):
68 |     """Supply default values for kwargs when instantiating class
69 | 
70 |     Augments the passed kwargs, by finding entries in the config dict
71 |     which match the class's ``.protocol`` attribute (one or more str)
72 | 
73 |     Parameters
74 |     ----------
75 |     cls : file system implementation
76 |     kwargs : dict
77 |     conf_dict : dict of dict
78 |         Typically this is the global configuration
79 | 
80 |     Returns
81 |     -------
82 |     dict : the modified set of kwargs
83 |     """
84 |     protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol]
85 |     kw = {}
86 |     for proto in protos:
87 |         # default kwargs from the current state of the config
88 |         if proto in conf_dict:
89 |             kw.update(conf_dict[proto])
90 |     # explicit kwargs always win
91 |     kw.update(**kwargs)
92 |     kwargs = kw
93 |     return kwargs
94 | 
95 | 
96 | set_conf_files(conf_dir, conf)
97 | set_conf_env(conf)
--------------------------------------------------------------------------------
/fsspec/implementations/tests/test_memory.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import sys
 3 | 
 4 | 
 5 | def test_1(m):
 6 |     m.touch("/somefile")  # NB: is found with or without initial /
 7 |     m.touch("afiles/and/anothers")
 8 |     files = m.find("")
 9 |     if "somefile" in files:
10 |         assert files == ["afiles/and/anothers", "somefile"]
11 |     else:
12 |         assert files == ["/somefile", "afiles/and/anothers"]
13 | 
14 |     files = sorted(m.get_mapper(""))
15 |     if "somefile" in files:
16 |         assert files == ["afiles/and/anothers", "somefile"]
17 |     else:
18 |         assert files == ["/somefile", "afiles/and/anothers"]
19 | 
20 | 
21 | @pytest.mark.xfail(
22 |     sys.version_info < (3, 6),
23 |     reason="py35 error, see https://github.com/intake/filesystem_spec/issues/148",
24 | )
25 | def test_ls(m):
26 |     m.mkdir("/dir")
27 |     m.mkdir("/dir/dir1")
28 | 
29 |     m.touch("/dir/afile")
30 |     m.touch("/dir/dir1/bfile")
31 |     m.touch("/dir/dir1/cfile")
32 | 
33 |     assert m.ls("/", False) == ["/dir/"]
34 |     assert m.ls("/dir", False) == ["/dir/afile", "/dir/dir1/"]
35 |     assert m.ls("/dir", True)[0]["type"] == "file"
36 |     assert m.ls("/dir", True)[1]["type"] == "directory"
37 | 
38 |     assert len(m.ls("/dir/dir1")) == 2
39 | 
40 | 
41 | def test_directories(m):
42 |     with pytest.raises(NotADirectoryError):
43 |         m.mkdir("outer/inner", create_parents=False)
44 |     m.mkdir("outer/inner")
45 | 
46 |     assert m.ls("outer")
47 |     assert m.ls("outer/inner") == []
48 | 
49 |     with pytest.raises(OSError):
50 |         m.rmdir("outer")
51 | 
52 |     m.rmdir("outer/inner")
53 |     m.rmdir("outer")
54 | 
55 |     assert not m.store
56 | 
57 | 
58 | def test_mv_recursive(m):
59 |     m.mkdir("src")
60 |     m.touch("src/file.txt")
61 |     m.mv("src", "dest", recursive=True)
62 |     assert m.exists("dest/file.txt")
63 |     assert not m.exists("src")
64 | 
65 | 
66 | def test_rm_no_pseudo_dir(m):
67 |     m.touch("/dir1/dir2/file")
68 |     m.rm("/dir1", recursive=True)
69 |     assert not m.exists("/dir1/dir2/file")
70 |     assert not m.exists("/dir1/dir2")
71 |     assert not m.exists("/dir1")
72 | 
73 |     with pytest.raises(FileNotFoundError):
74 |         m.rm("/dir1", recursive=True)
75 | 
76 | 
77 | def test_rewind(m):
78 |     # https://github.com/intake/filesystem_spec/issues/349
79 |     with m.open("src/file.txt", "w") as f:
80 |         f.write("content")
81 |     with m.open("src/file.txt") as f:
82 |         assert f.tell() == 0
83 | 
84 | 
85 | def test_no_rewind_append_mode(m):
86 |     # https://github.com/intake/filesystem_spec/issues/349
87 |     with m.open("src/file.txt", "w") as f:
88 |         f.write("content")
89 |     with m.open("src/file.txt", "a") as f:
90 |         assert f.tell() == 7
91 | 
92 | 
93 | def test_moves(m):
94 |     m.touch("source.txt")
95 |     m.mv("source.txt", "target.txt")
96 | 
97 |     m.touch("source2.txt")
98 |     m.mv("source2.txt", "target2.txt", recursive=True)
99 |     assert m.find("") == ["target.txt", "target2.txt"]
100 | 
101 | 
102 | def test_rm_recursive_empty_subdir(m):
103 |     # https://github.com/intake/filesystem_spec/issues/500
104 |     m.mkdir("recdir")
105 |     m.mkdir("recdir/subdir2")
106 |     m.rm("recdir/", recursive=True)
107 |     assert not m.exists("recdir")
108 | 
--------------------------------------------------------------------------------
/docs/source/developer.rst:
--------------------------------------------------------------------------------
 1 | Developing with fsspec
 2 | ----------------------
 3 | 
 4 | Whereas the majority of the documentation describes the use of ``fsspec``
 5 | from the end-user's point of view, ``fsspec`` is used by many libraries
 6 | as the primary/only interface to file operations.
 7 | 
 8 | Clients of the library
 9 | ~~~~~~~~~~~~~~~~~~~~~~
10 | 
11 | The most common entry point for libraries which wish to rely on ``fsspec``
12 | will be ``open`` or ``open_files``, as a way of generating an object compatible
13 | with the python file interface. This actually produces an ``OpenFile`` instance,
14 | which can be serialised across a network, and resources are only engaged when
15 | entering a context, e.g.
16 | 
17 | .. code-block:: python
18 | 
19 |     with fsspec.open("protocol://path", 'rb', param=value) as f:
20 |         process_file(f)
21 | 
22 | Note the backend-specific parameters that can be passed in this call.
23 | 
24 | In cases where the caller wants to control the context directly, they can use the
25 | ``open`` method of the ``OpenFile``, or get the filesystem object directly,
26 | skipping the ``OpenFile`` route. In the latter case, text encoding and compression
27 | are **not** handled for you. The file-like object can also be used as a context
28 | manager, or the ``close()`` method must be called explicitly to release resources.
29 | 
30 | .. code-block:: python
31 | 
32 |     # OpenFile route
33 |     of = fsspec.open("protocol://path", 'rb', param=value)
34 |     f = of.open()
35 |     process_file(f)
36 |     f.close()
37 | 
38 |     # filesystem class route, context
39 |     fs = fsspec.filesystem("protocol", param=value)
40 |     with fs.open("path", "rb") as f:
41 |         process_file(f)
42 | 
43 |     # filesystem class route, explicit close
44 |     fs = fsspec.filesystem("protocol", param=value)
45 |     f = fs.open("path", "rb")
46 |     process_file(f)
47 |     f.close()
48 | 
49 | Implementing a backend
50 | ~~~~~~~~~~~~~~~~~~~~~~
51 | 
52 | The class ``AbstractFileSystem`` provides a template of the methods
53 | that a potential implementation should supply, as well as default
54 | implementation of functionality that depends on these. Methods that
55 | *could* be implemented are marked with ``NotImplementedError`` or
56 | ``pass`` (the latter specifically for directory operations that might
57 | not be required for some backends where directories are emulated).
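As an illustration only, a minimal read-only backend might look like the
following sketch (the ``myfs`` protocol name and class are invented for this
example; real implementations override many more methods):

.. code-block:: python

    import fsspec
    from fsspec.spec import AbstractFileSystem


    class MyFileSystem(AbstractFileSystem):
        protocol = "myfs"

        def ls(self, path, detail=False, **kwargs):
            # return a list of info dicts (detail=True) or names (detail=False)
            ...

        def _open(self, path, mode="rb", **kwargs):
            # return a file-like object for the given path
            ...


    fsspec.register_implementation("myfs", MyFileSystem)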
Methods that *could* be implemented are marked with ``NotImplementedError`` or
56 | ``pass`` (the latter pattern is specifically for directory operations that
57 | might not be required for backends where directories are emulated).
58 | 
59 | Note that not all of the methods need to be implemented: for example,
60 | some implementations may be read-only, in which case things like ``pipe``,
61 | ``put``, ``touch``, ``rm``, etc., can be left as not-implemented
62 | (or you might implement them to raise ``PermissionError``, ``OSError`` with
63 | errno 30, or some other read-only exception).
64 | 
65 | We may eventually refactor ``AbstractFileSystem`` to split the default implementation,
66 | the set of methods that you might implement in a new backend, and the
67 | documented end-user API.
68 | 
69 | For now, new backends must register themselves on import
70 | (``register_implementation``) or post a PR to the ``fsspec`` repo
71 | asking to be included in ``fsspec.registry.known_implementations``.
72 | 
73 | Implementing async
74 | ~~~~~~~~~~~~~~~~~~
75 | 
76 | Starting in version 0.7.5, we provide async operations for some methods
77 | of some implementations.
78 | 
79 | This section will contain details on how to implement backends offering
80 | async, once the details are ironed out on our end.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # filesystem_spec
 2 | 
 3 | ![Build](https://github.com/intake/filesystem_spec/workflows/CI/badge.svg)
 4 | [![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
 5 | 
 6 | A specification for pythonic filesystems.
 7 | 
 8 | ## Install
 9 | 
10 | ```bash
11 | pip install fsspec
12 | ```
13 | or
14 | ```bash
15 | conda install -c conda-forge fsspec
16 | ```
17 | 
18 | ## Purpose
19 | 
20 | To produce a template or specification for a file-system interface, that specific implementations should follow,
21 | so that applications making use of them can rely on a common behaviour and not have to worry about the specific
22 | internal implementation decisions with any given backend. Many such implementations are included in this package,
23 | or in sister projects such as `s3fs` and `gcsfs`.
24 | 
25 | In addition, if this is well-designed, then additional functionality, such as a key-value store or FUSE
26 | mounting of the file-system implementation, may be available for all implementations "for free".
27 | 
28 | ## Documentation
29 | 
30 | Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
31 | 
32 | ## Develop
33 | 
34 | fsspec uses [tox](https://tox.readthedocs.io/en/latest/) and
35 | [tox-conda](https://github.com/tox-dev/tox-conda) to manage dev and test
36 | environments. First, install conda with tox and tox-conda in a base environment
37 | (e.g. `conda install -c conda-forge tox tox-conda`). Calls to `tox` can then be
38 | used to configure a development environment and run tests.
39 | 
40 | First, set up a development conda environment via `tox -e dev`. This will
41 | install fsspec dependencies, test & dev tools, and install fsspec in develop
42 | mode. Then, activate the dev environment under `.tox/dev` via `conda activate .tox/dev`.
43 | 
44 | ### Testing
45 | 
46 | Tests can be run directly in the activated dev environment via `pytest fsspec`.
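For example, to run a single backend's tests or filter by keyword (illustrative
pytest invocations, not project-specific commands):

```bash
# one test module, verbose
pytest fsspec/implementations/tests/test_memory.py -v

# all tests whose names mention transactions
pytest fsspec -k transaction
```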
47 | 
48 | The full fsspec test suite can be run via `tox`, which will set up and execute
49 | tests against multiple dependency versions in isolated environments. Run `tox
50 | -av` to list available test environments, and select environments via `tox -e `.
51 | 
52 | The full fsspec suite requires a system-level docker, docker-compose, and fuse
53 | installation. See `ci/install.sh` for a detailed installation example.
54 | 
55 | ### Code Formatting
56 | 
57 | fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure
58 | a consistent code format throughout the project. `black` is automatically
59 | installed in the tox dev env, activated via `conda activate .tox/dev`.
60 | 
61 | Then, run `black fsspec` from the root of the filesystem_spec repository to
62 | auto-format your code. Additionally, many editors have plugins that will apply
63 | `black` as you edit files.
64 | 
65 | Optionally, you may wish to set up [pre-commit hooks](https://pre-commit.com) to
66 | automatically run `black` when you make a git commit; `pre-commit` is likewise
67 | available in the tox dev env.
68 | 
69 | Then, run `pre-commit install --install-hooks` from the root of the
70 | filesystem_spec repository to set up pre-commit hooks. `black` will now be run
71 | before you commit, reformatting any changed files. You can format without
72 | committing via `pre-commit run` or skip these checks with `git commit
73 | --no-verify`.
--------------------------------------------------------------------------------
/fsspec/implementations/git.py:
--------------------------------------------------------------------------------
  1 | import pygit2
  2 | from fsspec.spec import AbstractFileSystem
  3 | from .memory import MemoryFile
  4 | import os
  5 | 
  6 | 
  7 | class GitFileSystem(AbstractFileSystem):
  8 |     """Browse the files of a local git repo at any hash/tag/branch
  9 | 
 10 |     (experimental backend)
 11 |     """
 12 | 
 13 |     root_marker = ""
 14 | 
 15 |     def __init__(self, path=None, ref=None, **kwargs):
 16 |         """
 17 | 
 18 |         Parameters
 19 |         ----------
 20 |         path: str (optional)
 21 |             Local location of the repo (uses current directory if not given)
 22 |         ref: str (optional)
 23 |             Reference to work with, could be a hash, tag or branch name. Defaults
 24 |             to the current working tree.
Note that ``ls`` and ``open`` also take hash, 25 | so this becomes the default for those operations 26 | kwargs 27 | """ 28 | super().__init__(**kwargs) 29 | self.repo = pygit2.Repository(path or os.getcwd()) 30 | self.ref = ref or "master" 31 | 32 | @classmethod 33 | def _strip_protocol(cls, path): 34 | return super()._strip_protocol(path).lstrip("/") 35 | 36 | def _path_to_object(self, path, ref): 37 | comm, ref = self.repo.resolve_refish(ref or self.ref) 38 | parts = path.split("/") 39 | tree = comm.tree 40 | for part in parts: 41 | if part and isinstance(tree, pygit2.Tree): 42 | tree = tree[part] 43 | return tree 44 | 45 | def ls(self, path, detail=True, ref=None, **kwargs): 46 | path = self._strip_protocol(path) 47 | tree = self._path_to_object(path, ref) 48 | if isinstance(tree, pygit2.Tree): 49 | out = [] 50 | for obj in tree: 51 | if isinstance(obj, pygit2.Tree): 52 | out.append( 53 | { 54 | "type": "directory", 55 | "name": "/".join([path, obj.name]).lstrip("/"), 56 | "hex": obj.hex, 57 | "mode": "%o" % obj.filemode, 58 | "size": 0, 59 | } 60 | ) 61 | else: 62 | out.append( 63 | { 64 | "type": "file", 65 | "name": "/".join([path, obj.name]).lstrip("/"), 66 | "hex": obj.hex, 67 | "mode": "%o" % obj.filemode, 68 | "size": obj.size, 69 | } 70 | ) 71 | else: 72 | obj = tree 73 | out = [ 74 | { 75 | "type": "file", 76 | "name": obj.name, 77 | "hex": obj.hex, 78 | "mode": "%o" % obj.filemode, 79 | "size": obj.size, 80 | } 81 | ] 82 | if detail: 83 | return out 84 | return [o["name"] for o in out] 85 | 86 | def ukey(self, path, ref=None): 87 | return self.info(path, ref=ref)["hex"] 88 | 89 | def _open( 90 | self, 91 | path, 92 | mode="rb", 93 | block_size=None, 94 | autocommit=True, 95 | cache_options=None, 96 | ref=None, 97 | **kwargs 98 | ): 99 | obj = self._path_to_object(path, ref or self.ref) 100 | return MemoryFile(data=obj.data) 101 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_smb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Test SMBFileSystem class using a docker container 4 | """ 5 | 6 | import logging 7 | import shlex 8 | import subprocess 9 | import time 10 | import pytest 11 | import fsspec 12 | 13 | pytest.importorskip("smbprotocol") 14 | 15 | # ! 
pylint: disable=redefined-outer-name,missing-function-docstring 16 | 17 | 18 | def stop_docker(container): 19 | cmd = shlex.split('docker ps -a -q --filter "name=%s"' % container) 20 | cid = subprocess.check_output(cmd).strip().decode() 21 | if cid: 22 | subprocess.call(["docker", "rm", "-f", "-v", cid]) 23 | 24 | 25 | @pytest.fixture(scope="module") 26 | def smb_params(): 27 | try: 28 | pchk = ["docker", "run", "--name", "fsspec_test_smb", "hello-world"] 29 | subprocess.check_call(pchk) 30 | except (subprocess.CalledProcessError, FileNotFoundError): 31 | pytest.skip("docker run not available") 32 | return 33 | stop_docker("fsspec_test_smb") 34 | 35 | # requires docker 36 | container = "fsspec_smb" 37 | stop_docker(container) 38 | img = "docker run --name {} --detach -p 139:139 -p 445:445 dperson/samba" 39 | cfg = " -p -u 'testuser;testpass' -s 'home;/share;no;no;no;testuser'" 40 | cmd = img.format(container) + cfg 41 | cid = subprocess.check_output(shlex.split(cmd)).strip().decode() 42 | logger = logging.getLogger("fsspec") 43 | logger.debug("Container: %s", cid) 44 | try: 45 | time.sleep(1) 46 | yield dict(host="localhost", port=445, username="testuser", password="testpass") 47 | finally: 48 | import smbclient # pylint: disable=import-outside-toplevel 49 | 50 | smbclient.reset_connection_cache() 51 | stop_docker(container) 52 | 53 | 54 | def test_simple(smb_params): 55 | adir = "/home/adir" 56 | adir2 = "/home/adir/otherdir/" 57 | afile = "/home/adir/otherdir/afile" 58 | fsmb = fsspec.get_filesystem_class("smb")(**smb_params) 59 | fsmb.mkdirs(adir2) 60 | fsmb.touch(afile) 61 | assert fsmb.find(adir) == [afile] 62 | assert fsmb.ls(adir2, detail=False) == [afile] 63 | assert fsmb.info(afile)["type"] == "file" 64 | assert fsmb.info(afile)["size"] == 0 65 | assert fsmb.exists(adir) 66 | fsmb.rm(adir, recursive=True) 67 | assert not fsmb.exists(adir) 68 | 69 | 70 | def test_with_url(smb_params): 71 | smb_url = "smb://{username}:{password}@{host}:{port}/home/someuser.txt" 72 | fwo = fsspec.open(smb_url.format(**smb_params), "wb") 73 | with fwo as fwr: 74 | fwr.write(b"hello") 75 | fro = fsspec.open(smb_url.format(**smb_params), "rb") 76 | with fro as frd: 77 | read_result = frd.read() 78 | assert read_result == b"hello" 79 | 80 | 81 | def test_transaction(smb_params): 82 | afile = "/home/afolder/otherdir/afile" 83 | afile2 = "/home/afolder/otherdir/afile2" 84 | adir = "/home/afolder" 85 | adir2 = "/home/afolder/otherdir" 86 | fsmb = fsspec.get_filesystem_class("smb")(**smb_params) 87 | fsmb.mkdirs(adir2) 88 | fsmb.start_transaction() 89 | fsmb.touch(afile) 90 | assert fsmb.find(adir) == [] 91 | fsmb.end_transaction() 92 | assert fsmb.find(adir) == [afile] 93 | 94 | with fsmb.transaction: 95 | assert fsmb._intrans 96 | fsmb.touch(afile2) 97 | assert fsmb.find(adir) == [afile] 98 | assert fsmb.find(adir) == [afile, afile2] 99 | 100 | 101 | def test_makedirs_exist_ok(smb_params): 102 | fsmb = fsspec.get_filesystem_class("smb")(**smb_params) 103 | fsmb.makedirs("/home/a/b/c") 104 | fsmb.makedirs("/home/a/b/c", exist_ok=True) 105 | -------------------------------------------------------------------------------- /docs/source/usage.rst: -------------------------------------------------------------------------------- 1 | Usage 2 | ===== 3 | 4 | This is quick-start documentation to help people get familiar with the layout and functioning of ``fsspec``. 
 5 | 
 6 | Instantiate a file-system
 7 | -------------------------
 8 | 
 9 | ``fsspec`` provides an abstract file-system interface as a template for other filesystems. In this context,
10 | "interface" means an API for working with files on the given file-system, which can mean files on some
11 | remote store, local files, files within some wrapper, or anything else that is capable of producing
12 | file-like objects.
13 | 
14 | Some concrete implementations are bundled with ``fsspec`` and others can be installed separately. They
15 | can be instantiated directly, or the `registry` can be used to find them.
16 | 
17 | Direct instantiation:
18 | 
19 | .. code-block:: python
20 | 
21 |     from fsspec.implementations.local import LocalFileSystem
22 |     fs = LocalFileSystem()
23 | 
24 | Look-up via registry:
25 | 
26 | .. code-block:: python
27 | 
28 |     import fsspec
29 |     fs = fsspec.filesystem('file')
30 | 
31 | Many filesystems also take extra parameters, some of which may be optional; see :doc:`api`.
32 | 
33 | .. code-block:: python
34 | 
35 |     import fsspec
36 |     fs = fsspec.filesystem('ftp', host=host, port=port,
37 |                            username=user, password=pw)
38 | 
39 | Use a file-system
40 | -----------------
41 | 
42 | File-system instances offer a large number of methods for getting information about and manipulating files
43 | for the given back-end. Although some specific implementations may not offer all features (e.g., ``http``
44 | is read-only), generally all normal operations, such as ``ls`` and ``rm``, should be expected to work (see the
45 | full list: :class:`fsspec.spec.AbstractFileSystem`).
46 | Note that this quick-start will prefer posix-style naming, but
47 | many common operations are aliased: ``cp()`` and ``copy()`` are identical, for instance.
48 | Functionality is generally chosen to be as close as possible to the builtin ``os`` module's
49 | behaviour for things like ``glob``.
50 | 
51 | The ``open()`` method will return a file-like object which can be passed to any other library that expects
52 | to work with python files. These will normally be binary-mode only, but may implement internal buffering
53 | in order to limit the number of reads from a remote source. They respect the use of ``with`` contexts. If
54 | you have ``pandas`` installed, for example, you can do the following:
55 | 
56 | .. code-block:: python
57 | 
58 |     import fsspec
59 |     import pandas as pd
60 |     with fsspec.open('https://raw.githubusercontent.com/dask/'
61 |                      'fastparquet/master/test-data/nation.csv') as f:
62 |         df = pd.read_csv(f, sep='|', header=None)
63 | 
64 | Higher-level
65 | ------------
66 | 
67 | For many situations, the only function that will be needed is :func:`fsspec.open_files()`, which will return
68 | :class:`fsspec.core.OpenFile` instances created from a single URL and parameters to pass to the backend.
69 | This supports text-mode and compression on the fly, and the objects can be serialized for passing between
70 | processes or machines (so long as each has access to the same backend file-system). The protocol (i.e.,
71 | backend) is inferred from the URL passed, and glob characters are expanded in read mode (search for files)
72 | or write mode (create names). Critically, the file on the backend system is not actually opened until the
73 | ``OpenFile`` instance is used in a ``with`` context. For the example above:
74 | 
75 | .. code-block:: python
76 | 
77 |     of = fsspec.open('https://raw.githubusercontent.com/dask/'
78 |                      'fastparquet/master/test-data/nation.csv', mode='r')
79 |     # of is a not-yet-open OpenFile object.
The "with" context actually opens it 80 | with of as f: 81 | # now f is a text-mode file 82 | df = pd.read_csv(f, sep='|', header=None) 83 | 84 | -------------------------------------------------------------------------------- /fsspec/tests/test_mapping.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sys 4 | 5 | import fsspec 6 | from fsspec.implementations.memory import MemoryFileSystem 7 | import pytest 8 | 9 | 10 | def test_mapping_prefix(tmpdir): 11 | tmpdir = str(tmpdir) 12 | os.makedirs(os.path.join(tmpdir, "afolder")) 13 | open(os.path.join(tmpdir, "afile"), "w").write("test") 14 | open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") 15 | 16 | m = fsspec.get_mapper("file://" + tmpdir) 17 | assert "afile" in m 18 | assert m["afolder/anotherfile"] == b"test2" 19 | 20 | fs = fsspec.filesystem("file") 21 | m2 = fs.get_mapper(tmpdir) 22 | m3 = fs.get_mapper("file://" + tmpdir) 23 | 24 | assert m == m2 == m3 25 | 26 | 27 | def test_getitems_errors(tmpdir): 28 | tmpdir = str(tmpdir) 29 | os.makedirs(os.path.join(tmpdir, "afolder")) 30 | open(os.path.join(tmpdir, "afile"), "w").write("test") 31 | open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") 32 | m = fsspec.get_mapper("file://" + tmpdir) 33 | assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} 34 | with pytest.raises(KeyError): 35 | m.getitems(["afile", "bfile"]) 36 | out = m.getitems(["afile", "bfile"], on_error="return") 37 | assert isinstance(out["bfile"], KeyError) 38 | m = fsspec.get_mapper("file://" + tmpdir, missing_exceptions=()) 39 | assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} 40 | with pytest.raises(FileNotFoundError): 41 | m.getitems(["afile", "bfile"]) 42 | 43 | 44 | def test_ops(): 45 | MemoryFileSystem.store.clear() 46 | m = fsspec.get_mapper("memory://") 47 | assert not m 48 | assert list(m) == [] 49 | 50 | with pytest.raises(KeyError): 51 | m["hi"] 52 | 53 | assert m.pop("key", 0) == 0 54 | 55 | m["key0"] = b"data" 56 | assert list(m) == ["key0"] 57 | assert m["key0"] == b"data" 58 | 59 | m.clear() 60 | 61 | assert list(m) == [] 62 | 63 | 64 | def test_pickle(): 65 | m = fsspec.get_mapper("memory://") 66 | assert isinstance(m.fs, MemoryFileSystem) 67 | m["key"] = b"data" 68 | m2 = pickle.loads(pickle.dumps(m)) 69 | assert list(m) == list(m2) 70 | assert m.missing_exceptions == m2.missing_exceptions 71 | 72 | 73 | def test_keys_view(): 74 | # https://github.com/intake/filesystem_spec/issues/186 75 | m = fsspec.get_mapper("memory://") 76 | m["key"] = b"data" 77 | 78 | keys = m.keys() 79 | assert len(keys) == 1 80 | # check that we don't consume the keys 81 | assert len(keys) == 1 82 | m.clear() 83 | 84 | 85 | def test_multi(): 86 | m = fsspec.get_mapper("memory://") 87 | data = {"a": b"data1", "b": b"data2"} 88 | m.setitems(data) 89 | 90 | assert m.getitems(list(data)) == data 91 | m.delitems(list(data)) 92 | assert not list(m) 93 | 94 | 95 | def test_setitem_types(): 96 | import array 97 | 98 | m = fsspec.get_mapper("memory://") 99 | m["a"] = array.array("i", [1]) 100 | if sys.byteorder == "little": 101 | assert m["a"] == b"\x01\x00\x00\x00" 102 | else: 103 | assert m["a"] == b"\x00\x00\x00\x01" 104 | m["b"] = bytearray(b"123") 105 | assert m["b"] == b"123" 106 | m.setitems({"c": array.array("i", [1]), "d": bytearray(b"123")}) 107 | if sys.byteorder == "little": 108 | assert m["c"] == b"\x01\x00\x00\x00" 109 | else: 110 | assert m["c"] == 
b"\x00\x00\x00\x01" 111 | assert m["d"] == b"123" 112 | 113 | 114 | def test_setitem_numpy(): 115 | m = fsspec.get_mapper("memory://") 116 | np = pytest.importorskip("numpy") 117 | m["c"] = np.array(1, dtype=" 0, "Timeout waiting for HDFS" 32 | time.sleep(1) 33 | continue 34 | break 35 | time.sleep(7) 36 | yield "localhost" 37 | finally: 38 | subprocess.check_output(cmd0) 39 | 40 | 41 | def test_pickle(hdfs_cluster): 42 | w = WebHDFS(hdfs_cluster, user="testuser") 43 | w2 = pickle.loads(pickle.dumps(w)) 44 | assert w == w2 45 | 46 | 47 | def test_simple(hdfs_cluster): 48 | w = WebHDFS(hdfs_cluster, user="testuser") 49 | home = w.home_directory() 50 | assert home == "/user/testuser" 51 | with pytest.raises(PermissionError): 52 | w.mkdir("/root") 53 | 54 | 55 | def test_url(hdfs_cluster): 56 | url = "webhdfs://testuser@localhost:50070/user/testuser/myfile" 57 | fo = fsspec.open(url, "wb", data_proxy={"worker.example.com": "localhost"}) 58 | with fo as f: 59 | f.write(b"hello") 60 | fo = fsspec.open(url, "rb", data_proxy={"worker.example.com": "localhost"}) 61 | with fo as f: 62 | assert f.read() == b"hello" 63 | 64 | 65 | def test_workflow(hdfs_cluster): 66 | w = WebHDFS( 67 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 68 | ) 69 | fn = "/user/testuser/testrun/afile" 70 | w.mkdir("/user/testuser/testrun") 71 | with w.open(fn, "wb") as f: 72 | f.write(b"hello") 73 | assert w.exists(fn) 74 | info = w.info(fn) 75 | assert info["size"] == 5 76 | assert w.isfile(fn) 77 | assert w.cat(fn) == b"hello" 78 | w.rm("/user/testuser/testrun", recursive=True) 79 | assert not w.exists(fn) 80 | 81 | 82 | def test_with_gzip(hdfs_cluster): 83 | from gzip import GzipFile 84 | 85 | w = WebHDFS( 86 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 87 | ) 88 | fn = "/user/testuser/gzfile" 89 | with w.open(fn, "wb") as f: 90 | gf = GzipFile(fileobj=f, mode="w") 91 | gf.write(b"hello") 92 | gf.close() 93 | with w.open(fn, "rb") as f: 94 | gf = GzipFile(fileobj=f, mode="r") 95 | assert gf.read() == b"hello" 96 | 97 | 98 | def test_workflow_transaction(hdfs_cluster): 99 | w = WebHDFS( 100 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 101 | ) 102 | fn = "/user/testuser/testrun/afile" 103 | w.mkdirs("/user/testuser/testrun") 104 | with w.transaction: 105 | with w.open(fn, "wb") as f: 106 | f.write(b"hello") 107 | assert not w.exists(fn) 108 | assert w.exists(fn) 109 | assert w.ukey(fn) 110 | files = w.ls("/user/testuser/testrun", True) 111 | summ = w.content_summary("/user/testuser/testrun") 112 | assert summ["length"] == files[0]["size"] 113 | assert summ["fileCount"] == 1 114 | 115 | w.rm("/user/testuser/testrun", recursive=True) 116 | assert not w.exists(fn) 117 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_sftp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import shlex 3 | import subprocess 4 | import time 5 | import fsspec 6 | 7 | pytest.importorskip("paramiko") 8 | 9 | 10 | def stop_docker(name): 11 | cmd = shlex.split('docker ps -a -q --filter "name=%s"' % name) 12 | cid = subprocess.check_output(cmd).strip().decode() 13 | if cid: 14 | subprocess.call(["docker", "rm", "-f", cid]) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def ssh(): 19 | try: 20 | subprocess.check_call(["docker", "run", "hello-world"]) 21 | except (subprocess.CalledProcessError, FileNotFoundError): 22 | 
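        # Docker (or its daemon) is unavailable, so the container-backed SSH
        # server cannot be started; skip every test in this module.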
        pytest.skip("docker run not available")
 23 |         return
 24 | 
 25 |     # requires docker
 26 |     cmds = [
 27 |         r"apt-get update",
 28 |         r"apt-get install -y openssh-server",
 29 |         r"mkdir /var/run/sshd",
 30 |         "bash -c \"echo 'root:pass' | chpasswd\"",
 31 |         (
 32 |             r"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' "
 33 |             r"/etc/ssh/sshd_config"
 34 |         ),
 35 |         (
 36 |             r"sed 's@session\s*required\s*pam_loginuid.so@session optional "
 37 |             r"pam_loginuid.so@g' -i /etc/pam.d/sshd"
 38 |         ),
 39 |         r'bash -c "echo \"export VISIBLE=now\" >> /etc/profile"',
 40 |         r"/usr/sbin/sshd",
 41 |     ]
 42 |     name = "fsspec_sftp"
 43 |     stop_docker(name)
 44 |     cmd = "docker run -d -p 9200:22 --name {} ubuntu:16.04 sleep 9000".format(name)
 45 |     cid = subprocess.check_output(shlex.split(cmd)).strip().decode()
 46 |     for cmd in cmds:
 47 |         subprocess.call(["docker", "exec", cid] + shlex.split(cmd))
 48 |     try:
 49 |         time.sleep(1)
 50 |         yield dict(host="localhost", port=9200, username="root", password="pass")
 51 |     finally:
 52 |         stop_docker(name)
 53 | 
 54 | 
 55 | def test_simple(ssh):
 56 |     f = fsspec.get_filesystem_class("sftp")(**ssh)
 57 |     f.mkdirs("/home/someuser/deeper")
 58 |     f.touch("/home/someuser/deeper/afile")
 59 |     assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
 60 |     assert f.ls("/home/someuser/deeper/") == ["/home/someuser/deeper/afile"]
 61 |     assert f.info("/home/someuser/deeper/afile")["type"] == "file"
 62 |     assert f.info("/home/someuser/deeper/afile")["size"] == 0
 63 |     assert f.exists("/home/someuser")
 64 |     f.rm("/home/someuser", recursive=True)
 65 |     assert not f.exists("/home/someuser")
 66 | 
 67 | 
 68 | @pytest.mark.parametrize("protocol", ["sftp", "ssh"])
 69 | def test_with_url(protocol, ssh):
 70 |     fo = fsspec.open(
 71 |         protocol + "://{username}:{password}@{host}:{port}"
 72 |         "/home/someuserout".format(**ssh),
 73 |         "wb",
 74 |     )
 75 |     with fo as f:
 76 |         f.write(b"hello")
 77 |     fo = fsspec.open(
 78 |         protocol + "://{username}:{password}@{host}:{port}"
 79 |         "/home/someuserout".format(**ssh),
 80 |         "rb",
 81 |     )
 82 |     with fo as f:
 83 |         assert f.read() == b"hello"
 84 | 
 85 | 
 86 | def test_transaction(ssh):
 87 |     f = fsspec.get_filesystem_class("sftp")(**ssh)
 88 |     f.mkdirs("/home/someuser/deeper")
 89 |     f.start_transaction()
 90 |     f.touch("/home/someuser/deeper/afile")
 91 |     assert f.find("/home/someuser") == []
 92 |     f.end_transaction()
 93 |     assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
 94 | 
 95 |     with f.transaction:
 96 |         assert f._intrans
 97 |         f.touch("/home/someuser/deeper/afile2")
 98 |         assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
 99 |     assert f.find("/home/someuser") == [
100 |         "/home/someuser/deeper/afile",
101 |         "/home/someuser/deeper/afile2",
102 |     ]
103 | 
104 | 
105 | def test_makedirs_exist_ok(ssh):
106 |     f = fsspec.get_filesystem_class("sftp")(**ssh)
107 | 
108 |     f.makedirs("/a/b/c")
109 | 
110 |     with pytest.raises(FileExistsError, match="/a/b/c"):
111 |         f.makedirs("/a/b/c", exist_ok=False)
112 | 
113 |     f.makedirs("/a/b/c", exist_ok=True)
--------------------------------------------------------------------------------
/fsspec/implementations/tests/test_ftp.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pytest
 3 | import subprocess
 4 | import sys
 5 | import time
 6 | 
 7 | from fsspec.implementations.ftp import FTPFileSystem
 8 | from fsspec import open_files
 9 | import fsspec
10 | 
11 | here = os.path.dirname(os.path.abspath(__file__))
12 | 
13 | 
14 | @pytest.fixture()
15 | def ftp():
16 | 
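    # Start a throwaway FTP server via pyftpdlib, serving this tests directory
    # on its default port 2121; the subprocess is terminated when the fixture
    # goes out of scope.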
pytest.importorskip("pyftpdlib") 17 | P = subprocess.Popen( 18 | [sys.executable, "-m", "pyftpdlib", "-d", here], 19 | stderr=subprocess.STDOUT, 20 | stdout=subprocess.PIPE, 21 | ) 22 | try: 23 | time.sleep(1) 24 | yield "localhost", 2121 25 | finally: 26 | P.terminate() 27 | P.wait() 28 | 29 | 30 | def test_basic(ftp): 31 | host, port = ftp 32 | fs = FTPFileSystem(host, port) 33 | assert fs.ls("/", detail=False) == sorted(os.listdir(here)) 34 | out = fs.cat("/" + os.path.basename(__file__)) 35 | assert out == open(__file__, "rb").read() 36 | 37 | 38 | def test_not_cached(ftp): 39 | host, port = ftp 40 | fs = FTPFileSystem(host, port) 41 | fs2 = FTPFileSystem(host, port) 42 | assert fs is not fs2 43 | 44 | 45 | @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) 46 | def test_complex(ftp_writable, cache_type): 47 | from fsspec.core import BytesCache 48 | 49 | host, port, user, pw = ftp_writable 50 | files = open_files( 51 | "ftp:///ou*", 52 | host=host, 53 | port=port, 54 | username=user, 55 | password=pw, 56 | block_size=10000, 57 | cache_type=cache_type, 58 | ) 59 | assert len(files) == 1 60 | with files[0] as fo: 61 | assert fo.read(10) == b"hellohello" 62 | if isinstance(fo.cache, BytesCache): 63 | assert len(fo.cache.cache) == 10010 64 | assert fo.read(2) == b"he" 65 | assert fo.tell() == 12 66 | 67 | 68 | def test_write_small(ftp_writable): 69 | host, port, user, pw = ftp_writable 70 | fs = FTPFileSystem(host, port, user, pw) 71 | with fs.open("/out2", "wb") as f: 72 | f.write(b"oi") 73 | assert fs.cat("/out2") == b"oi" 74 | 75 | 76 | def test_with_url(ftp_writable): 77 | host, port, user, pw = ftp_writable 78 | fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "wb") 79 | with fo as f: 80 | f.write(b"hello") 81 | fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "rb") 82 | with fo as f: 83 | assert f.read() == b"hello" 84 | 85 | 86 | @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) 87 | def test_write_big(ftp_writable, cache_type): 88 | host, port, user, pw = ftp_writable 89 | fs = FTPFileSystem(host, port, user, pw, block_size=1000, cache_type=cache_type) 90 | fn = "/bigger" 91 | with fs.open(fn, "wb") as f: 92 | f.write(b"o" * 500) 93 | assert not fs.exists(fn) 94 | f.write(b"o" * 1000) 95 | fs.invalidate_cache() 96 | assert fs.exists(fn) 97 | f.write(b"o" * 200) 98 | f.flush() 99 | 100 | assert fs.info(fn)["size"] == 1700 101 | assert fs.cat(fn) == b"o" * 1700 102 | 103 | 104 | def test_transaction(ftp_writable): 105 | host, port, user, pw = ftp_writable 106 | fs = FTPFileSystem(host, port, user, pw) 107 | fs.mkdir("/tmp") 108 | fn = "/tr" 109 | with fs.transaction: 110 | with fs.open(fn, "wb") as f: 111 | f.write(b"not") 112 | assert not fs.exists(fn) 113 | assert fs.exists(fn) 114 | assert fs.cat(fn) == b"not" 115 | 116 | fs.rm(fn) 117 | assert not fs.exists(fn) 118 | 119 | 120 | def test_transaction_with_cache(ftp_writable): 121 | host, port, user, pw = ftp_writable 122 | fs = FTPFileSystem(host, port, user, pw) 123 | fs.mkdir("/tmp") 124 | fs.mkdir("/tmp/dir") 125 | assert "dir" in fs.ls("/tmp", detail=False) 126 | 127 | with fs.transaction: 128 | fs.rmdir("/tmp/dir") 129 | 130 | assert "dir" not in fs.ls("/tmp", detail=False) 131 | assert not fs.exists("/tmp/dir") 132 | -------------------------------------------------------------------------------- /fsspec/tests/test_registry.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from unittest.mock import 
create_autospec, patch 3 | 4 | import pytest 5 | 6 | from fsspec.registry import ( 7 | ReadOnlyError, 8 | _registry, 9 | get_filesystem_class, 10 | known_implementations, 11 | register_implementation, 12 | registry, 13 | ) 14 | from fsspec.spec import AbstractFileSystem 15 | 16 | try: 17 | from importlib.metadata import EntryPoint 18 | except ImportError: # python < 3.8 19 | from importlib_metadata import EntryPoint 20 | 21 | 22 | @pytest.fixture() 23 | def clear_registry(): 24 | try: 25 | yield 26 | finally: 27 | _registry.clear() 28 | known_implementations.pop("test", None) 29 | 30 | 31 | @pytest.fixture() 32 | def clean_imports(): 33 | try: 34 | real_module = sys.modules["fsspec"] 35 | del sys.modules["fsspec"] 36 | yield 37 | finally: 38 | sys.modules["fsspec"] = real_module 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "protocol,module,minversion,oldversion", 43 | [("s3", "s3fs", "0.3.0", "0.1.0"), ("gs", "gcsfs", "0.3.0", "0.1.0")], 44 | ) 45 | def test_minversion_s3fs(protocol, module, minversion, oldversion, monkeypatch): 46 | _registry.clear() 47 | mod = pytest.importorskip(module, minversion) 48 | 49 | assert get_filesystem_class("s3") is not None 50 | _registry.clear() 51 | 52 | monkeypatch.setattr(mod, "__version__", oldversion) 53 | with pytest.raises(RuntimeError, match=minversion): 54 | get_filesystem_class(protocol) 55 | 56 | 57 | def test_registry_readonly(): 58 | get_filesystem_class("file") 59 | assert "file" in registry 60 | assert "file" in list(registry) 61 | with pytest.raises(ReadOnlyError): 62 | del registry["file"] 63 | with pytest.raises(ReadOnlyError): 64 | registry["file"] = None 65 | with pytest.raises(ReadOnlyError): 66 | registry.clear() 67 | 68 | 69 | def test_register_cls(clear_registry): 70 | with pytest.raises(ValueError): 71 | get_filesystem_class("test") 72 | register_implementation("test", AbstractFileSystem) 73 | cls = get_filesystem_class("test") 74 | assert cls is AbstractFileSystem 75 | 76 | 77 | def test_register_str(clear_registry): 78 | with pytest.raises(ValueError): 79 | get_filesystem_class("test") 80 | register_implementation("test", "fsspec.AbstractFileSystem") 81 | assert "test" not in registry 82 | cls = get_filesystem_class("test") 83 | assert cls is AbstractFileSystem 84 | assert "test" in registry 85 | 86 | 87 | def test_register_fail(clear_registry): 88 | register_implementation("test", "doesntexist.AbstractFileSystem") 89 | with pytest.raises(ImportError): 90 | get_filesystem_class("test") 91 | 92 | register_implementation("test", "doesntexist.AbstractFileSystem") 93 | with pytest.raises(ValueError): 94 | register_implementation("test", "doesntexist.AbstractFileSystem", clobber=False) 95 | 96 | register_implementation( 97 | "test", "doesntexist.AbstractFileSystem", errtxt="hiho", clobber=True 98 | ) 99 | with pytest.raises(ImportError) as e: 100 | get_filesystem_class("test") 101 | assert "hiho" in str(e.value) 102 | register_implementation("test", AbstractFileSystem) 103 | 104 | with pytest.raises(ValueError): 105 | register_implementation("test", AbstractFileSystem, clobber=False) 106 | register_implementation("test", AbstractFileSystem, clobber=True) 107 | 108 | 109 | def test_entry_points_registered_on_import(clear_registry, clean_imports): 110 | mock_ep = create_autospec(EntryPoint, module="fsspec.spec.AbstractFileSystem") 111 | mock_ep.name = "test" # this can't be set in the constructor... 
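    # Patch the stdlib entry-point lookup so that importing fsspec "discovers"
    # the fake backend, mirroring how third-party packages advertise
    # implementations through the "fsspec.specs" entry-point group.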
112 |     if sys.version_info < (3, 8):
113 |         import_location = "importlib_metadata.entry_points"
114 |     else:
115 |         import_location = "importlib.metadata.entry_points"
116 |     with patch(import_location, return_value={"fsspec.specs": [mock_ep]}):
117 |         assert "test" not in registry
118 |         import fsspec
119 | 
120 |         get_filesystem_class("test")
121 |         assert "test" in registry
--------------------------------------------------------------------------------
/fsspec/implementations/jupyter.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import io
  3 | import fsspec
  4 | import re
  5 | import requests
  6 | 
  7 | 
  8 | class JupyterFileSystem(fsspec.AbstractFileSystem):
  9 |     """View of the files as seen by a Jupyter server (notebook or lab)"""
 10 | 
 11 |     protocol = ("jupyter", "jlab")
 12 | 
 13 |     def __init__(self, url, tok=None, **kwargs):
 14 |         """
 15 | 
 16 |         Parameters
 17 |         ----------
 18 |         url : str
 19 |             Base URL of the server, like "http://127.0.0.1:8888". May include
 20 |             token in the string, which is given by the process when starting up
 21 |         tok : str
 22 |             If the token is obtained separately, can be given here
 23 |         kwargs
 24 |         """
 25 |         if "?" in url:
 26 |             if tok is None:
 27 |                 try:
 28 |                     tok = re.findall("token=([a-z0-9]+)", url)[0]
 29 |                 except IndexError as e:
 30 |                     raise ValueError("Could not determine token") from e
 31 |             url = url.split("?", 1)[0]
 32 |         self.url = url.rstrip("/") + "/api/contents"
 33 |         self.session = requests.Session()
 34 |         if tok:
 35 |             self.session.headers["Authorization"] = f"token {tok}"
 36 | 
 37 |         super().__init__(**kwargs)
 38 | 
 39 |     def ls(self, path, detail=True, **kwargs):
 40 |         path = self._strip_protocol(path)
 41 |         r = self.session.get(self.url + "/" + path)
 42 |         if r.status_code == 404:
 43 |             raise FileNotFoundError(path)
 44 |         r.raise_for_status()
 45 |         out = r.json()
 46 | 
 47 |         if out["type"] == "directory":
 48 |             out = out["content"]
 49 |         else:
 50 |             out = [out]
 51 |         for o in out:
 52 |             o["name"] = o.pop("path")
 53 |             o.pop("content")
 54 |             if o["type"] == "notebook":
 55 |                 o["type"] = "file"
 56 |         if detail:
 57 |             return out
 58 |         return [o["name"] for o in out]
 59 | 
 60 |     def cat_file(self, path):
 61 |         path = self._strip_protocol(path)
 62 |         r = self.session.get(self.url + "/" + path)
 63 |         if r.status_code == 404:
 64 |             raise FileNotFoundError(path)
 65 |         r.raise_for_status()
 66 |         out = r.json()
 67 |         if out["format"] == "text":
 68 |             # data should be binary
 69 |             return out["content"].encode()
 70 |         else:
 71 |             return base64.b64decode(out["content"])
 72 | 
 73 |     def pipe_file(self, path, value, **_):
 74 |         path = self._strip_protocol(path)
 75 |         json = {
 76 |             "name": path.rsplit("/", 1)[-1],
 77 |             "path": path,
 78 |             "size": len(value),
 79 |             "content": base64.b64encode(value).decode(),
 80 |             "format": "base64",
 81 |             "type": "file",
 82 |         }
 83 |         self.session.put(self.url + "/" + path, json=json)
 84 | 
 85 |     def mkdir(self, path, create_parents=True, **kwargs):
 86 |         path = self._strip_protocol(path)
 87 |         if create_parents and "/" in path:
 88 |             self.mkdir(path.rsplit("/", 1)[0], True)
 89 |         json = {
 90 |             "name": path.rsplit("/", 1)[-1],
 91 |             "path": path,
 92 |             "size": None,
 93 |             "content": None,
 94 |             "type": "directory",
 95 |         }
 96 |         self.session.put(self.url + "/" + path, json=json)
 97 | 
 98 |     def _rm(self, path):
 99 |         path = self._strip_protocol(path)
100 |         self.session.delete(self.url + "/" + path)
101 | 
102 |     def _open(self, path, mode="rb", **kwargs):
103 |         path = self._strip_protocol(path)
104 |         if mode == "rb":
105 |             data = self.cat_file(path)
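            # the server offers no random access, so fetch the whole file once
            # and hand back a seekable in-memory view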
106 |             return io.BytesIO(data)
107 |         else:
108 |             return SimpleFileWriter(self, path, mode="wb")
109 | 
110 | 
111 | class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
112 |     def _upload_chunk(self, final=False):
113 |         """Never uploads a chunk until the file is done
114 | 
115 |         Not suitable for large files
116 |         """
117 |         if final is False:
118 |             return False
119 |         self.buffer.seek(0)
120 |         data = self.buffer.read()
121 |         self.fs.pipe_file(self.path, data)
--------------------------------------------------------------------------------
/fsspec/implementations/tests/test_dbfs.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test-Cases for the DataBricks Filesystem.
 3 | This test case is somewhat special, as there is no "mock" databricks
 4 | API available. We use the "vcr" package to record the requests and
 5 | responses to the real databricks API and replay them on tests.
 6 | 
 7 | This however means that when you change the tests (or when the API
 8 | itself changes, which is very unlikely to occur as it is versioned),
 9 | you need to re-record the answers. This can be done as follows:
10 | 
11 | 1. Delete all cassette files in the "./cassettes" folder
12 | 2. Spin up a databricks cluster. For example,
13 |    you can use an Azure Databricks instance for this.
14 | 3. Take note of the instance details (the instance URL. For example for an Azure
15 |    databricks cluster, this has the form
16 |    adb-..azuredatabricks.net)
17 |    and your personal token (Find out more here:
18 |    https://docs.databricks.com/dev-tools/api/latest/authentication.html)
19 | 4. Set the two environment variables `DBFS_INSTANCE` and `DBFS_TOKEN`
20 | 5. Now execute the tests as normal. The results of the API calls will be recorded.
21 | 6. Unset the environment variables and replay the tests.
22 | """
23 | from urllib.parse import urlparse
24 | import os
25 | 
26 | import pytest
27 | import fsspec
28 | 
29 | DUMMY_INSTANCE = "my_instance.com"
30 | INSTANCE = os.getenv("DBFS_INSTANCE", DUMMY_INSTANCE)
31 | TOKEN = os.getenv("DBFS_TOKEN", "")
32 | 
33 | 
34 | @pytest.fixture(scope="module")
35 | def vcr_config():
36 |     """
37 |     To avoid recording the instance and token details
38 |     (which are sensitive), we delete them from both the
39 |     request and the response before storing them.
40 |     We also delete the date as it is likely to change
41 |     (and will make git diffs harder).
42 |     If the DBFS_TOKEN env variable is set, we record with VCR.
43 |     If not, we only replay (to not accidentally record with a wrong URL).
44 | """ 45 | 46 | def before_record_response(response): 47 | try: 48 | del response["headers"]["x-databricks-org-id"] 49 | del response["headers"]["date"] 50 | except KeyError: 51 | pass 52 | return response 53 | 54 | def before_record_request(request): 55 | # Replace the instance URL 56 | uri = urlparse(request.uri) 57 | uri = uri._replace(netloc=DUMMY_INSTANCE) 58 | request.uri = uri.geturl() 59 | 60 | return request 61 | 62 | if TOKEN: 63 | return { 64 | "record_mode": "once", 65 | "filter_headers": [("authorization", "DUMMY")], 66 | "before_record_response": before_record_response, 67 | "before_record_request": before_record_request, 68 | } 69 | else: 70 | return { 71 | "record_mode": "none", 72 | } 73 | 74 | 75 | @pytest.fixture 76 | def dbfsFS(): 77 | fs = fsspec.filesystem( 78 | "dbfs", 79 | instance=INSTANCE, 80 | token=TOKEN, 81 | ) 82 | 83 | return fs 84 | 85 | 86 | @pytest.mark.vcr() 87 | def test_dbfs_file_listing(dbfsFS): 88 | assert "/FileStore" in dbfsFS.ls("/", detail=False) 89 | assert {"name": "/FileStore", "size": 0, "type": "directory"} in dbfsFS.ls( 90 | "/", detail=True 91 | ) 92 | 93 | 94 | @pytest.mark.vcr() 95 | def test_dbfs_mkdir(dbfsFS): 96 | dbfsFS.rm("/FileStore/my", recursive=True) 97 | assert "/FileStore/my" not in dbfsFS.ls("/FileStore/", detail=False) 98 | 99 | dbfsFS.mkdir("/FileStore/my/dir", create_parents=True) 100 | 101 | assert "/FileStore/my" in dbfsFS.ls("/FileStore/", detail=False) 102 | assert "/FileStore/my/dir" in dbfsFS.ls("/FileStore/my/", detail=False) 103 | 104 | with pytest.raises(FileExistsError): 105 | dbfsFS.mkdir("/FileStore/my/dir", create_parents=True, exist_ok=False) 106 | 107 | with pytest.raises(OSError): 108 | dbfsFS.rm("/FileStore/my", recursive=False) 109 | 110 | assert "/FileStore/my" in dbfsFS.ls("/FileStore/", detail=False) 111 | 112 | dbfsFS.rm("/FileStore/my", recursive=True) 113 | assert "/FileStore/my" not in dbfsFS.ls("/FileStore/", detail=False) 114 | 115 | 116 | @pytest.mark.vcr() 117 | def test_dbfs_write_and_read(dbfsFS): 118 | dbfsFS.rm("/FileStore/file.csv") 119 | assert "/FileStore/file.csv" not in dbfsFS.ls("/FileStore/", detail=False) 120 | 121 | content = b"This is a test\n" * 100000 + b"For this is the end\n" 122 | 123 | with dbfsFS.open("/FileStore/file.csv", "wb") as f: 124 | f.write(content) 125 | 126 | assert "/FileStore/file.csv" in dbfsFS.ls("/FileStore", detail=False) 127 | 128 | with dbfsFS.open("/FileStore/file.csv", "rb") as f: 129 | data = f.read() 130 | assert data == content 131 | 132 | dbfsFS.rm("/FileStore/file.csv") 133 | assert "/FileStore/file.csv" not in dbfsFS.ls("/FileStore/", detail=False) 134 | -------------------------------------------------------------------------------- /fsspec/implementations/sftp.py: -------------------------------------------------------------------------------- 1 | import paramiko 2 | from stat import S_ISDIR, S_ISLNK 3 | import types 4 | import uuid 5 | from .. import AbstractFileSystem 6 | from ..utils import infer_storage_options 7 | 8 | 9 | class SFTPFileSystem(AbstractFileSystem): 10 | """Files over SFTP/SSH 11 | 12 | Peer-to-peer filesystem over SSH using paramiko. 13 | 14 | Note: if using this with the ``open`` or ``open_files``, with full URLs, 15 | there is no way to tell if a path is relative, so all paths are assumed 16 | to be absolute. 
17 | """ 18 | 19 | protocol = "sftp", "ssh" 20 | 21 | def __init__(self, host, **ssh_kwargs): 22 | """ 23 | 24 | Parameters 25 | ---------- 26 | host: str 27 | Hostname or IP as a string 28 | temppath: str 29 | Location on the server to put files, when within a transaction 30 | ssh_kwargs: dict 31 | Parameters passed on to connection. See details in 32 | http://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect 33 | May include port, username, password... 34 | """ 35 | if self._cached: 36 | return 37 | super(SFTPFileSystem, self).__init__(**ssh_kwargs) 38 | self.temppath = ssh_kwargs.pop("temppath", "/tmp") 39 | self.host = host 40 | self.ssh_kwargs = ssh_kwargs 41 | self._connect() 42 | 43 | def _connect(self): 44 | self.client = paramiko.SSHClient() 45 | self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 46 | self.client.connect(self.host, **self.ssh_kwargs) 47 | self.ftp = self.client.open_sftp() 48 | 49 | @classmethod 50 | def _strip_protocol(cls, path): 51 | return infer_storage_options(path)["path"] 52 | 53 | @staticmethod 54 | def _get_kwargs_from_urls(urlpath): 55 | out = infer_storage_options(urlpath) 56 | out.pop("path", None) 57 | out.pop("protocol", None) 58 | return out 59 | 60 | def mkdir(self, path, mode=511): 61 | self.ftp.mkdir(path, mode) 62 | 63 | def makedirs(self, path, exist_ok=False, mode=511): 64 | if self.exists(path) and not exist_ok: 65 | raise FileExistsError("File exists: {}".format(path)) 66 | 67 | parts = path.split("/") 68 | path = "" 69 | 70 | for part in parts: 71 | path += "/" + part 72 | if not self.exists(path): 73 | self.mkdir(path, mode) 74 | 75 | def rmdir(self, path): 76 | self.ftp.rmdir(path) 77 | 78 | def info(self, path): 79 | s = self.ftp.stat(path) 80 | if S_ISDIR(s.st_mode): 81 | t = "directory" 82 | elif S_ISLNK(s.st_mode): 83 | t = "link" 84 | else: 85 | t = "file" 86 | return { 87 | "name": path + "/" if t == "directory" else path, 88 | "size": s.st_size, 89 | "type": t, 90 | "uid": s.st_uid, 91 | "gid": s.st_gid, 92 | "time": s.st_atime, 93 | "mtime": s.st_mtime, 94 | } 95 | 96 | def ls(self, path, detail=False): 97 | out = ["/".join([path.rstrip("/"), p]) for p in self.ftp.listdir(path)] 98 | out = [self.info(o) for o in out] 99 | if detail: 100 | return out 101 | return sorted([p["name"] for p in out]) 102 | 103 | def put(self, lpath, rpath): 104 | self.ftp.put(lpath, rpath) 105 | 106 | def get(self, rpath, lpath): 107 | self.ftp.get(rpath, lpath) 108 | 109 | def _open(self, path, mode="rb", block_size=None, **kwargs): 110 | """ 111 | block_size: int or None 112 | If 0, no buffering, if 1, line buffering, if >1, buffer that many 113 | bytes, if None use default from paramiko. 
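        When ``autocommit=False`` (i.e., within a transaction), data is
        written to a temporary path and only moved into place on commit.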
114 |         """
115 |         if kwargs.get("autocommit", True) is False:
116 |             # writes to temporary file, move on commit
117 |             path2 = "{}/{}".format(self.temppath, uuid.uuid4())
118 |             f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
119 |             f.temppath = path2
120 |             f.targetpath = path
121 |             f.fs = self
122 |             f.commit = types.MethodType(commit_a_file, f)
123 |             f.discard = types.MethodType(discard_a_file, f)
124 |         else:
125 |             f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
126 |         return f
127 | 
128 |     def _rm(self, path):
129 |         if self.isdir(path):
130 |             self.ftp.rmdir(path)
131 |         else:
132 |             self.ftp.remove(path)
133 | 
134 |     def mv(self, old, new):
135 |         self.ftp.posix_rename(old, new)
136 | 
137 | 
138 | def commit_a_file(self):
139 |     self.fs.mv(self.temppath, self.targetpath)
140 | 
141 | 
142 | def discard_a_file(self):
143 |     self.fs._rm(self.temppath)
144 | 
--------------------------------------------------------------------------------
/docs/source/intro.rst:
--------------------------------------------------------------------------------
 1 | Introduction
 2 | ============
 3 | 
 4 | To get stuck into using the package, rather than reading about its philosophy and history, you can
 5 | skip to :doc:`usage`.
 6 | 
 7 | Background
 8 | ----------
 9 | 
10 | Python provides a standard interface for open files, so that alternate implementations of file-like objects can
11 | work seamlessly with many functions which rely only on the methods of that standard interface. A number of libraries
12 | have implemented a similar concept for file-systems, where file operations can be performed on a logical file-system
13 | which may be local, a structured data store, or some remote service.
14 | 
15 | This repository is intended to be a place to define a standard interface that such file-systems should adhere to,
16 | such that code using them should not have to know the details of the implementation in order to operate on any of
17 | a number of backends. The hope is that the community can come together to
18 | define an interface that is best for the largest number of users, and that having the specification will make
19 | developing other file-system implementations simpler.
20 | 
21 | History
22 | -------
23 | 
24 | I (Martin Durant) have been involved in building a number of remote-data file-system implementations, principally
25 | in the context of the `Dask`_ project. In particular, several are listed
26 | in `docs`_ with links to the specific repositories.
27 | With common authorship, there is much that is similar between the implementations, for example posix-like naming
28 | of the operations, and this has allowed Dask to be able to interact with the various backends and parse generic
29 | URLs in order to select amongst them. However, *some* extra code was required in each case to adapt the peculiarities
30 | of each implementation to the generic usage that Dask demanded. People may find the
31 | `code`_ which parses URLs and creates file-system
32 | instances interesting.
33 | 
34 | .. _Dask: http://dask.pydata.org/en/latest/
35 | .. _docs: http://dask.pydata.org/en/latest/remote-data-services.html
36 | .. _code: https://github.com/dask/dask/blob/master/dask/bytes/core.py#L266
37 | 
38 | At the same time, the Apache `Arrow`_ project was also concerned with a similar problem,
39 | particularly a common interface to local and HDFS files, for example the
40 | `hdfs`_ interface (which actually communicated with HDFS
41 | with a choice of driver).
These are mostly used internally within Arrow, but Dask was modified in order to be able 42 | to use the alternate HDFS interface (which solves some security issues with `hdfs3`). In the process, a 43 | `conversation`_ 44 | was started, and I invite all interested parties to continue the conversation in this location. 45 | 46 | .. _Arrow: https://arrow.apache.org/ 47 | .. _hdfs: https://arrow.apache.org/docs/python/filesystems.html 48 | .. _conversation: https://github.com/dask/dask/issues/2880 49 | 50 | There is a good argument that this type of code has no place in Dask, which is concerned with making graphs 51 | representing computations, and executing those graphs on a scheduler. Indeed, the file-systems are generally useful, 52 | and each has a user-base wider than just those that work via Dask. 53 | 54 | Influences 55 | ---------- 56 | 57 | The following places to consider, when choosing the definitions of how we would like the file-system specification 58 | to look: 59 | 60 | - python's `os`_ module and its `path` namespace; also other file-connected 61 | functionality in the standard library 62 | - posix/bash method naming conventions that linux/unix/osx users are familiar with; or perhaps their Windows variants 63 | - the existing implementations for the various backends (e.g., 64 | `gcsfs`_ or Arrow's 65 | `hdfs`_) 66 | - `pyfilesystems`_, an attempt to do something similar, with a 67 | plugin architecture. This conception has several types of local file-system, and a lot of well-thought-out 68 | validation code. 69 | 70 | .. _os: https://docs.python.org/3/library/os.html 71 | .. _gcsfs: http://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem 72 | .. _pyfilesystems: https://docs.pyfilesystem.org/en/latest/index.html 73 | 74 | Not pyfilesystems? 75 | ------------------ 76 | 77 | It might have been conceivable to reuse code in ``pyfilesystems``, which has an established interface and several 78 | implementations of its own. However, it supports none of the :ref:`highlight`, critical to 79 | cloud and parallel access, and would not be easy to 80 | coerce. Following on the success of ``s3fs`` and ``gcsfs``, and their use within Dask, it seemed best to 81 | have an interface as close to those as possible. See a 82 | `discussion`_ on the topic. 83 | 84 | .. _discussion: https://github.com/intake/filesystem_spec/issues/5 85 | 86 | Structure of the package 87 | ------------------------ 88 | 89 | The best place to get a feel for the contents of ``fsspec`` is by looking through the :doc:`usage` and 90 | :doc:`api` sections. In addition, the source code will be interesting for those who wish to subclass and 91 | develop new file-system implementations. ``fsspec/spec.py`` contains the main abstract file-system class 92 | to derive from, ``AbstractFileSystem``. 93 | 94 | .. 
_zarr: https://zarr.readthedocs.io 95 | -------------------------------------------------------------------------------- /fsspec/implementations/dask.py: -------------------------------------------------------------------------------- 1 | from distributed.worker import get_worker 2 | from distributed.client import _get_global_client, Client 3 | import dask 4 | from fsspec.spec import AbstractFileSystem, AbstractBufferedFile 5 | from fsspec import filesystem 6 | from fsspec.utils import infer_storage_options 7 | 8 | 9 | def _get_client(client): 10 | if client is None: 11 | return _get_global_client() 12 | elif isinstance(client, Client): 13 | return client 14 | else: 15 | # e.g., connection string 16 | return Client(client) 17 | 18 | 19 | class DaskWorkerFileSystem(AbstractFileSystem): 20 | """View files accessible to a worker as any other remote file-system 21 | 22 | When instances are run on the worker, uses the real filesystem. When 23 | run on the client, they call the worker to provide information or data. 24 | 25 | **Warning** this implementation is experimental, and read-only for now. 26 | """ 27 | 28 | def __init__( 29 | self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs 30 | ): 31 | super().__init__(**kwargs) 32 | if not (fs is None) ^ (target_protocol is None): 33 | raise ValueError( 34 | "Please provide one of filesystem instance (fs) or" 35 | " target_protocol, not both" 36 | ) 37 | self.target_protocol = target_protocol 38 | self.target_options = target_options 39 | self.worker = None 40 | self.client = client 41 | self.fs = fs 42 | self._determine_worker() 43 | 44 | @staticmethod 45 | def _get_kwargs_from_urls(path): 46 | so = infer_storage_options(path) 47 | if "host" in so and "port" in so: 48 | return {"client": f"{so['host']}:{so['port']}"} 49 | else: 50 | return {} 51 | 52 | def _determine_worker(self): 53 | try: 54 | get_worker() 55 | self.worker = True 56 | if self.fs is None: 57 | self.fs = filesystem( 58 | self.target_protocol, **(self.target_options or {}) 59 | ) 60 | except ValueError: 61 | self.worker = False 62 | self.client = _get_client(self.client) 63 | self.rfs = dask.delayed(self) 64 | 65 | def mkdir(self, *args, **kwargs): 66 | if self.worker: 67 | self.fs.mkdir(*args, **kwargs) 68 | else: 69 | self.rfs.mkdir(*args, **kwargs).compute() 70 | 71 | def rm(self, *args, **kwargs): 72 | if self.worker: 73 | self.fs.rm(*args, **kwargs) 74 | else: 75 | self.rfs.rm(*args, **kwargs).compute() 76 | 77 | def copy(self, *args, **kwargs): 78 | if self.worker: 79 | self.fs.copy(*args, **kwargs) 80 | else: 81 | self.rfs.copy(*args, **kwargs).compute() 82 | 83 | def mv(self, *args, **kwargs): 84 | if self.worker: 85 | self.fs.mv(*args, **kwargs) 86 | else: 87 | self.rfs.mv(*args, **kwargs).compute() 88 | 89 | def ls(self, *args, **kwargs): 90 | if self.worker: 91 | return self.fs.ls(*args, **kwargs) 92 | else: 93 | return self.rfs.ls(*args, **kwargs).compute() 94 | 95 | def _open( 96 | self, 97 | path, 98 | mode="rb", 99 | block_size=None, 100 | autocommit=True, 101 | cache_options=None, 102 | **kwargs 103 | ): 104 | if self.worker: 105 | return self.fs._open( 106 | path, 107 | mode=mode, 108 | block_size=block_size, 109 | autocommit=autocommit, 110 | cache_options=cache_options, 111 | **kwargs 112 | ) 113 | else: 114 | return DaskFile( 115 | fs=self, 116 | path=path, 117 | mode=mode, 118 | block_size=block_size, 119 | autocommit=autocommit, 120 | cache_options=cache_options, 121 | **kwargs 122 | ) 123 | 124 | def fetch_range(self, path, mode, 
start, end): 125 | if self.worker: 126 | with self._open(path, mode) as f: 127 | f.seek(start) 128 | return f.read(end - start) 129 | else: 130 | return self.rfs.fetch_range(path, mode, start, end).compute() 131 | 132 | 133 | class DaskFile(AbstractBufferedFile): 134 | def __init__(self, mode="rb", **kwargs): 135 | if mode != "rb": 136 | raise ValueError('Remote dask files can only be opened in "rb" mode') 137 | super().__init__(**kwargs) 138 | 139 | def _upload_chunk(self, final=False): 140 | pass 141 | 142 | def _initiate_upload(self): 143 | """ Create remote file/upload """ 144 | pass 145 | 146 | def _fetch_range(self, start, end): 147 | """Get the specified set of bytes from remote""" 148 | return self.fs.fetch_range(self.path, self.mode, start, end) 149 | -------------------------------------------------------------------------------- /fsspec/compression.py: -------------------------------------------------------------------------------- 1 | """Helper functions for a standard streaming compression API""" 2 | from bz2 import BZ2File 3 | from gzip import GzipFile 4 | from zipfile import ZipFile 5 | 6 | import fsspec.utils 7 | from fsspec.spec import AbstractBufferedFile 8 | 9 | 10 | def noop_file(file, mode, **kwargs): 11 | return file 12 | 13 | 14 | # TODO: files should also be available as contexts 15 | # should be functions of the form func(infile, mode=, **kwargs) -> file-like 16 | compr = {None: noop_file} 17 | 18 | 19 | def register_compression(name, callback, extensions, force=False): 20 | """Register an "inferable" file compression type. 21 | 22 | Registers transparent file compression type for use with fsspec.open. 23 | Compression can be specified by name in open, or "infer"-ed for any files 24 | ending with the given extensions. 25 | 26 | Args: 27 | name: (str) The compression type name. Eg. "gzip". 28 | callback: A callable of form (infile, mode, **kwargs) -> file-like. 29 | Accepts an input file-like object, the target mode and kwargs. 30 | Returns a wrapped file-like object. 31 | extensions: (str, Iterable[str]) A file extension, or list of file 32 | extensions for which to infer this compression scheme. Eg. "gz". 33 | force: (bool) Force re-registration of compression type or extensions. 34 | 35 | Raises: 36 | ValueError: If name or extensions already registered, and not force. 
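    Example:
        Registering a hypothetical no-op "compression" under a made-up
        ".raw" suffix (illustrative only)::

            register_compression("raw", lambda f, mode, **kw: f, "raw")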
37 | 38 | """ 39 | if isinstance(extensions, str): 40 | extensions = [extensions] 41 | 42 | # Validate registration 43 | if name in compr and not force: 44 | raise ValueError("Duplicate compression registration: %s" % name) 45 | 46 | for ext in extensions: 47 | if ext in fsspec.utils.compressions and not force: 48 | raise ValueError( 49 | "Duplicate compression file extension: %s (%s)" % (ext, name) 50 | ) 51 | 52 | compr[name] = callback 53 | 54 | for ext in extensions: 55 | fsspec.utils.compressions[ext] = name 56 | 57 | 58 | def unzip(infile, mode="rb", filename=None, **kwargs): 59 | if "r" not in mode: 60 | filename = filename or "file" 61 | z = ZipFile(infile, mode="w", **kwargs) 62 | fo = z.open(filename, mode="w") 63 | fo.close = lambda closer=fo.close: closer() or z.close() 64 | return fo 65 | z = ZipFile(infile) 66 | if filename is None: 67 | filename = z.namelist()[0] 68 | return z.open(filename, mode="r", **kwargs) 69 | 70 | 71 | register_compression("zip", unzip, "zip") 72 | register_compression("bz2", BZ2File, "bz2") 73 | register_compression("gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz") 74 | 75 | try: 76 | from lzma import LZMAFile 77 | 78 | register_compression("lzma", LZMAFile, "xz") 79 | register_compression("xz", LZMAFile, "xz", force=True) 80 | except ImportError: 81 | pass 82 | 83 | try: 84 | import lzmaffi 85 | 86 | register_compression("lzma", lzmaffi.LZMAFile, "xz", force=True) 87 | register_compression("xz", lzmaffi.LZMAFile, "xz", force=True) 88 | except ImportError: 89 | pass 90 | 91 | 92 | class SnappyFile(AbstractBufferedFile): 93 | def __init__(self, infile, mode, **kwargs): 94 | import snappy 95 | 96 | self.details = {"size": 999999999} # not true, but OK if we don't seek 97 | super().__init__(fs=None, path="snappy", mode=mode.strip("b") + "b", **kwargs) 98 | self.infile = infile 99 | if "r" in mode: 100 | self.codec = snappy.StreamDecompressor() 101 | else: 102 | self.codec = snappy.StreamCompressor() 103 | 104 | def _upload_chunk(self, final=False): 105 | self.buffer.seek(0) 106 | out = self.codec.add_chunk(self.buffer.read()) 107 | self.infile.write(out) 108 | return True 109 | 110 | def seek(self, loc, whence=0): 111 | raise NotImplementedError("SnappyFile is not seekable") 112 | 113 | def seekable(self): 114 | return False 115 | 116 | def _fetch_range(self, start, end): 117 | """Get the specified set of bytes from remote""" 118 | data = self.infile.read(end - start) 119 | return self.codec.decompress(data) 120 | 121 | 122 | try: 123 | import snappy 124 | 125 | snappy.compress 126 | # Snappy may use the .sz file extension, but this is not part of the 127 | # standard implementation. 
128 | register_compression("snappy", SnappyFile, []) 129 | 130 | except (ImportError, NameError): 131 | pass 132 | 133 | try: 134 | import lz4.frame 135 | 136 | register_compression("lz4", lz4.frame.open, "lz4") 137 | except ImportError: 138 | pass 139 | 140 | try: 141 | import zstandard as zstd 142 | 143 | def zstandard_file(infile, mode="rb"): 144 | if "r" in mode: 145 | cctx = zstd.ZstdDecompressor() 146 | return cctx.stream_reader(infile) 147 | else: 148 | cctx = zstd.ZstdCompressor(level=10) 149 | return cctx.stream_writer(infile) 150 | 151 | register_compression("zstd", zstandard_file, "zst") 152 | except ImportError: 153 | pass 154 | -------------------------------------------------------------------------------- /fsspec/tests/test_compression.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pytest 4 | 5 | import fsspec.core 6 | from fsspec.compression import compr, register_compression 7 | from fsspec.utils import compressions, infer_compression 8 | 9 | 10 | def test_infer_custom_compression(): 11 | """Inferred compression gets values from fsspec.compression.compr.""" 12 | assert infer_compression("fn.zip") == "zip" 13 | assert infer_compression("fn.gz") == "gzip" 14 | assert infer_compression("fn.unknown") is None 15 | assert infer_compression("fn.test_custom") is None 16 | assert infer_compression("fn.tst") is None 17 | 18 | register_compression("test_custom", lambda f, **kwargs: f, "tst") 19 | 20 | try: 21 | assert infer_compression("fn.zip") == "zip" 22 | assert infer_compression("fn.gz") == "gzip" 23 | assert infer_compression("fn.unknown") is None 24 | assert infer_compression("fn.test_custom") is None 25 | assert infer_compression("fn.tst") == "test_custom" 26 | 27 | # Duplicate registration in name or extension raises a value error. 28 | with pytest.raises(ValueError): 29 | register_compression("test_custom", lambda f, **kwargs: f, "tst") 30 | 31 | with pytest.raises(ValueError): 32 | register_compression("test_conflicting", lambda f, **kwargs: f, "tst") 33 | assert "test_conflicting" not in compr 34 | 35 | # ...but can be forced. 
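As an aside on the mechanics this test exercises: inference is essentially a lookup of the final filename suffix in ``fsspec.utils.compressions``, so compound extensions resolve by their last component:

    from fsspec.utils import infer_compression

    assert infer_compression("table.csv.gz") == "gzip"  # last suffix wins
    assert infer_compression("archive.zip") == "zip"
    assert infer_compression("plain.txt") is None       # unknown suffix -> None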
36 | register_compression( 37 | "test_conflicting", lambda f, **kwargs: f, "tst", force=True 38 | ) 39 | assert infer_compression("fn.zip") == "zip" 40 | assert infer_compression("fn.gz") == "gzip" 41 | assert infer_compression("fn.unknown") is None 42 | assert infer_compression("fn.test_custom") is None 43 | assert infer_compression("fn.tst") == "test_conflicting" 44 | 45 | finally: 46 | del compr["test_custom"] 47 | del compr["test_conflicting"] 48 | del compressions["tst"] 49 | 50 | 51 | def test_lzma_compression_name(): 52 | pytest.importorskip("lzma") 53 | assert infer_compression("fn.xz") == "xz" 54 | 55 | 56 | def test_lz4_compression(tmpdir): 57 | """Infer lz4 compression for .lz4 files if lz4 is available.""" 58 | tmp_path = pathlib.Path(str(tmpdir)) 59 | 60 | lz4 = pytest.importorskip("lz4") 61 | 62 | tmp_path.mkdir(exist_ok=True) 63 | 64 | tdat = "foobar" * 100 65 | 66 | with fsspec.core.open( 67 | str(tmp_path / "out.lz4"), mode="wt", compression="infer" 68 | ) as outfile: 69 | outfile.write(tdat) 70 | 71 | compressed = (tmp_path / "out.lz4").open("rb").read() 72 | assert lz4.frame.decompress(compressed).decode() == tdat 73 | 74 | with fsspec.core.open( 75 | str(tmp_path / "out.lz4"), mode="rt", compression="infer" 76 | ) as infile: 77 | assert infile.read() == tdat 78 | 79 | with fsspec.core.open( 80 | str(tmp_path / "out.lz4"), mode="rt", compression="lz4" 81 | ) as infile: 82 | assert infile.read() == tdat 83 | 84 | 85 | def test_zstd_compression(tmpdir): 86 | """Infer zstd compression for .zst files if zstandard is available.""" 87 | tmp_path = pathlib.Path(str(tmpdir)) 88 | 89 | zstd = pytest.importorskip("zstandard") 90 | 91 | tmp_path.mkdir(exist_ok=True) 92 | 93 | tdat = "foobar" * 100 94 | 95 | with fsspec.core.open( 96 | str(tmp_path / "out.zst"), mode="wt", compression="infer" 97 | ) as outfile: 98 | outfile.write(tdat) 99 | 100 | compressed = (tmp_path / "out.zst").open("rb").read() 101 | assert zstd.ZstdDecompressor().decompress(compressed, len(tdat)).decode() == tdat 102 | 103 | with fsspec.core.open( 104 | str(tmp_path / "out.zst"), mode="rt", compression="infer" 105 | ) as infile: 106 | assert infile.read() == tdat 107 | 108 | with fsspec.core.open( 109 | str(tmp_path / "out.zst"), mode="rt", compression="zstd" 110 | ) as infile: 111 | assert infile.read() == tdat 112 | 113 | 114 | def test_snappy_compression(tmpdir): 115 | """No registered compression for snappy, but can be specified.""" 116 | tmp_path = pathlib.Path(str(tmpdir)) 117 | 118 | snappy = pytest.importorskip("snappy") 119 | 120 | tmp_path.mkdir(exist_ok=True) 121 | 122 | tdat = "foobar" * 100 123 | 124 | # Snappy isn't inferred. 125 | with fsspec.core.open( 126 | str(tmp_path / "out.snappy"), mode="wt", compression="infer" 127 | ) as outfile: 128 | outfile.write(tdat) 129 | assert (tmp_path / "out.snappy").open("rb").read().decode() == tdat 130 | 131 | # but can be specified. 
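A general note before the explicit-compression case below: a codec always wraps the underlying binary stream, and ``fsspec.open`` layers the text encoding on top for "t" modes, which is why str data round-trips. A sketch with the built-in gzip codec (the path is illustrative):

    import fsspec

    with fsspec.open("memory://t.txt.gz", "wt", compression="infer") as f:
        f.write("text passes through gzip transparently")

    with fsspec.open("memory://t.txt.gz", "rt", compression="infer") as f:
        assert f.read() == "text passes through gzip transparently"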
132 | with fsspec.core.open( 133 | str(tmp_path / "out.snappy"), mode="wt", compression="snappy" 134 | ) as outfile: 135 | outfile.write(tdat) 136 | 137 | compressed = (tmp_path / "out.snappy").open("rb").read() 138 | assert snappy.StreamDecompressor().decompress(compressed).decode() == tdat 139 | 140 | with fsspec.core.open( 141 | str(tmp_path / "out.snappy"), mode="rb", compression="infer" 142 | ) as infile: 143 | assert infile.read() == compressed 144 | 145 | with fsspec.core.open( 146 | str(tmp_path / "out.snappy"), mode="rt", compression="snappy" 147 | ) as infile: 148 | assert infile.read() == tdat 149 | -------------------------------------------------------------------------------- /fsspec/fuse.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import stat 4 | from errno import ENOENT, EIO 5 | from fuse import Operations, FuseOSError 6 | import threading 7 | import time 8 | from fuse import FUSE 9 | 10 | 11 | class FUSEr(Operations): 12 | def __init__(self, fs, path): 13 | self.fs = fs 14 | self.cache = {} 15 | self.root = path.rstrip("/") + "/" 16 | self.counter = 0 17 | 18 | def getattr(self, path, fh=None): 19 | path = "".join([self.root, path.lstrip("/")]).rstrip("/") 20 | try: 21 | info = self.fs.info(path) 22 | except FileNotFoundError: 23 | raise FuseOSError(ENOENT) 24 | data = {"st_uid": 1000, "st_gid": 1000} 25 | perm = 0o777 26 | 27 | if info["type"] != "file": 28 | data["st_mode"] = stat.S_IFDIR | perm 29 | data["st_size"] = 0 30 | data["st_blksize"] = 0 31 | else: 32 | data["st_mode"] = stat.S_IFREG | perm 33 | data["st_size"] = info["size"] 34 | data["st_blksize"] = 5 * 2 ** 20 35 | data["st_nlink"] = 1 36 | data["st_atime"] = time.time() 37 | data["st_ctime"] = time.time() 38 | data["st_mtime"] = time.time() 39 | return data 40 | 41 | def readdir(self, path, fh): 42 | path = "".join([self.root, path.lstrip("/")]) 43 | files = self.fs.ls(path, False) 44 | files = [os.path.basename(f.rstrip("/")) for f in files] 45 | return [".", ".."] + files 46 | 47 | def mkdir(self, path, mode): 48 | path = "".join([self.root, path.lstrip("/")]) 49 | self.fs.mkdir(path) 50 | return 0 51 | 52 | def rmdir(self, path): 53 | path = "".join([self.root, path.lstrip("/")]) 54 | self.fs.rmdir(path) 55 | return 0 56 | 57 | def read(self, path, size, offset, fh): 58 | f = self.cache[fh] 59 | f.seek(offset) 60 | out = f.read(size) 61 | return out 62 | 63 | def write(self, path, data, offset, fh): 64 | f = self.cache[fh] 65 | f.write(data) 66 | return len(data) 67 | 68 | def create(self, path, flags, fi=None): 69 | fn = "".join([self.root, path.lstrip("/")]) 70 | f = self.fs.open(fn, "wb") 71 | self.cache[self.counter] = f 72 | self.counter += 1 73 | return self.counter - 1 74 | 75 | def open(self, path, flags): 76 | fn = "".join([self.root, path.lstrip("/")]) 77 | if flags % 2 == 0: 78 | # read 79 | mode = "rb" 80 | else: 81 | # write/create 82 | mode = "wb" 83 | self.cache[self.counter] = self.fs.open(fn, mode) 84 | self.counter += 1 85 | return self.counter - 1 86 | 87 | def truncate(self, path, length, fh=None): 88 | fn = "".join([self.root, path.lstrip("/")]) 89 | if length != 0: 90 | raise NotImplementedError 91 | # maybe should be no-op since open with write sets size to zero anyway 92 | self.fs.touch(fn) 93 | 94 | def unlink(self, path): 95 | fn = "".join([self.root, path.lstrip("/")]) 96 | try: 97 | self.fs.rm(fn, False) 98 | except (IOError, FileNotFoundError): 99 | raise FuseOSError(EIO) 100 
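To make the handle bookkeeping concrete: each ``open``/``create`` above hands out an integer ``fh`` indexing ``self.cache``, which ``release`` (next) closes and evicts. A sketch of driving the class directly, assuming fusepy is importable (paths illustrative):

    import fsspec
    from fsspec.fuse import FUSEr

    fuser = FUSEr(fsspec.filesystem("memory"), "/")
    fh = fuser.create("/demo", flags=0)   # flags are ignored by create; opens "wb"
    fuser.write("/demo", b"abc", 0, fh)
    fuser.release("/demo", fh)            # closes the file and drops the handle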
| 101 | def release(self, path, fh): 102 | try: 103 | if fh in self.cache: 104 | f = self.cache[fh] 105 | f.close() 106 | self.cache.pop(fh) 107 | except Exception as e: 108 | print(e) 109 | return 0 110 | 111 | def chmod(self, path, mode): 112 | raise NotImplementedError 113 | 114 | 115 | def run(fs, path, mount_point, foreground=True, threads=False): 116 | """Mount a file-system in a local directory 117 | 118 | This uses fusepy to make it appear as if a given path on an fsspec 119 | instance is in fact resident within the local file-system. 120 | 121 | This requires that fusepy be installed, and that FUSE be available on 122 | the system (typically requiring a package to be installed with 123 | apt, yum, brew, etc.). 124 | 125 | Parameters 126 | ---------- 127 | fs: file-system instance 128 | From one of the compatible implementations 129 | path: str 130 | Location on that file-system to regard as the root directory to 131 | mount. Note that you typically should include the terminating "/" 132 | character. 133 | mount_point: str 134 | An empty directory on the local file-system where the contents of 135 | the remote path will appear 136 | foreground: bool 137 | Whether or not calling this function will block. Operation will 138 | typically be more stable if True. 139 | threads: bool 140 | Whether or not to create threads when responding to file operations 141 | within the mounted directory. Operation will typically be more 142 | stable if False. 143 | 144 | """ 145 | func = lambda: FUSE( 146 | FUSEr(fs, path), mount_point, nothreads=not threads, foreground=foreground 147 | ) 148 | if foreground is False: 149 | th = threading.Thread(target=func) 150 | th.daemon = True 151 | th.start() 152 | return th 153 | else: # pragma: no cover 154 | try: 155 | func() 156 | except KeyboardInterrupt: 157 | pass 158 | -------------------------------------------------------------------------------- /fsspec/implementations/zip.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | 3 | import zipfile 4 | from fsspec import AbstractFileSystem, open_files 5 | from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE 6 | 7 | 8 | class ZipFileSystem(AbstractFileSystem): 9 | """Read contents of ZIP archive as a file-system 10 | 11 | Keeps file object open while instance lives. 12 | 13 | This class is pickleable, but not necessarily thread-safe. 14 | """ 15 | 16 | root_marker = "" 17 | 18 | def __init__( 19 | self, 20 | fo="", 21 | mode="r", 22 | target_protocol=None, 23 | target_options=None, 24 | block_size=DEFAULT_BLOCK_SIZE, 25 | **kwargs 26 | ): 27 | """ 28 | Parameters 29 | ---------- 30 | fo: str or file-like 31 | Contains ZIP, and must exist. If a str, will fetch file using 32 | `open_files()`, which must return one file exactly. 33 | mode: str 34 | Currently, only 'r' is accepted 35 | target_protocol: str (optional) 36 | If ``fo`` is a string, this value can be used to override the 37 | FS protocol inferred from a URL 38 | target_options: dict (optional) 39 | Kwargs passed when instantiating the target FS, if ``fo`` is 40 | a string.
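A usage sketch for ``fsspec.fuse.run`` above, assuming fusepy and a system FUSE package are installed; the mount point is illustrative and must be an existing empty directory:

    import fsspec
    from fsspec.fuse import run

    fs = fsspec.filesystem("memory")
    with fs.open("/data/hello", "wb") as f:  # seed a file to expose
        f.write(b"world")

    # Blocks while mounted; pass foreground=False to get a daemon thread back.
    run(fs, "/data/", "/mnt/fsspec-demo")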
41 | """ 42 | super().__init__(self, **kwargs) 43 | if mode != "r": 44 | raise ValueError("Only read from zip files accepted") 45 | if isinstance(fo, str): 46 | files = open_files(fo, protocol=target_protocol, **(target_options or {})) 47 | if len(files) != 1: 48 | raise ValueError( 49 | 'Path "{}" did not resolve to exactly' 50 | 'one file: "{}"'.format(fo, files) 51 | ) 52 | fo = files[0] 53 | self.fo = fo.__enter__() # the whole instance is a context 54 | self.zip = zipfile.ZipFile(self.fo) 55 | self.block_size = block_size 56 | self.dir_cache = None 57 | 58 | @classmethod 59 | def _strip_protocol(cls, path): 60 | # zip file paths are always relative to the archive root 61 | return super()._strip_protocol(path).lstrip("/") 62 | 63 | def _get_dirs(self): 64 | if self.dir_cache is None: 65 | files = self.zip.infolist() 66 | self.dir_cache = { 67 | dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"} 68 | for dirname in self._all_dirnames(self.zip.namelist()) 69 | } 70 | for z in files: 71 | f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__} 72 | f.update( 73 | { 74 | "name": z.filename, 75 | "size": z.file_size, 76 | "type": ("directory" if z.is_dir() else "file"), 77 | } 78 | ) 79 | self.dir_cache[f["name"]] = f 80 | 81 | def info(self, path, **kwargs): 82 | self._get_dirs() 83 | path = self._strip_protocol(path) 84 | if path in self.dir_cache: 85 | return self.dir_cache[path] 86 | elif path + "/" in self.dir_cache: 87 | return self.dir_cache[path + "/"] 88 | else: 89 | raise FileNotFoundError(path) 90 | 91 | def ls(self, path, detail=False, **kwargs): 92 | self._get_dirs() 93 | paths = {} 94 | for p, f in self.dir_cache.items(): 95 | p = p.rstrip("/") 96 | if "/" in p: 97 | root = p.rsplit("/", 1)[0] 98 | else: 99 | root = "" 100 | if root == path.rstrip("/"): 101 | paths[p] = f 102 | elif all( 103 | (a == b) 104 | for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) 105 | ): 106 | # root directory entry 107 | ppath = p.rstrip("/").split("/", 1)[0] 108 | if ppath not in paths: 109 | out = {"name": ppath + "/", "size": 0, "type": "directory"} 110 | paths[ppath] = out 111 | out = list(paths.values()) 112 | if detail: 113 | return out 114 | else: 115 | return list(sorted(f["name"] for f in out)) 116 | 117 | def cat(self, path): 118 | return self.zip.read(path) 119 | 120 | def _open( 121 | self, 122 | path, 123 | mode="rb", 124 | block_size=None, 125 | autocommit=True, 126 | cache_options=None, 127 | **kwargs 128 | ): 129 | path = self._strip_protocol(path) 130 | if mode != "rb": 131 | raise NotImplementedError 132 | info = self.info(path) 133 | out = self.zip.open(path, "r") 134 | out.size = info["size"] 135 | out.name = info["name"] 136 | return out 137 | 138 | def ukey(self, path): 139 | return tokenize(path, self.fo, self.protocol) 140 | 141 | def _all_dirnames(self, paths): 142 | """Returns *all* directory names for each path in paths, including intermediate ones. 
143 | 144 | Parameters 145 | ---------- 146 | paths: Iterable of path strings 147 | """ 148 | if len(paths) == 0: 149 | return set() 150 | 151 | dirnames = {self._parent(path) for path in paths} - {self.root_marker} 152 | return dirnames | self._all_dirnames(dirnames) 153 | -------------------------------------------------------------------------------- /fsspec/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import pytest 4 | import tempfile 5 | 6 | from fsspec.core import ( 7 | _expand_paths, 8 | OpenFile, 9 | open_local, 10 | get_compression, 11 | open_files, 12 | OpenFiles, 13 | ) 14 | import fsspec 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "path, name_function, num, out", 19 | [ 20 | [["apath"], None, 1, ["apath"]], 21 | ["apath.*.csv", None, 1, ["apath.0.csv"]], 22 | ["apath.*.csv", None, 2, ["apath.0.csv", "apath.1.csv"]], 23 | ["a*", lambda x: "abc"[x], 2, ["aa", "ab"]], 24 | ], 25 | ) 26 | def test_expand_paths(path, name_function, num, out): 27 | assert _expand_paths(path, name_function, num) == out 28 | 29 | 30 | def test_expand_error(): 31 | with pytest.raises(ValueError): 32 | _expand_paths("*.*", None, 1) 33 | 34 | 35 | def test_openfile_api(m): 36 | m.open("somepath", "wb").write(b"data") 37 | of = OpenFile(m, "somepath") 38 | assert str(of) == "" 39 | f = of.open() 40 | assert f.read() == b"data" 41 | f.close() 42 | with OpenFile(m, "somepath", mode="rt") as f: 43 | f.read() == "data" 44 | 45 | 46 | def test_openfile_open(m): 47 | of = OpenFile(m, "somepath", mode="wt") 48 | f = of.open() 49 | f.write("hello") 50 | assert m.size("somepath") == 0 # no flush yet 51 | del of 52 | assert m.size("somepath") == 0 # still no flush 53 | f.close() 54 | assert m.size("somepath") == 5 55 | 56 | 57 | def test_open_local(): 58 | d1 = str(tempfile.mkdtemp()) 59 | f1 = os.path.join(d1, "f1") 60 | open(f1, "w").write("test1") 61 | d2 = str(tempfile.mkdtemp()) 62 | fn = open_local("simplecache://" + f1, cache_storage=d2, target_protocol="file") 63 | assert isinstance(fn, str) 64 | assert open(fn).read() == "test1" 65 | assert d2 in fn 66 | 67 | 68 | def test_xz_lzma_compressions(): 69 | pytest.importorskip("lzma") 70 | # Ensure that both 'xz' and 'lzma' compression names can be parsed 71 | assert get_compression("some_file.xz", "infer") == "xz" 72 | assert get_compression("some_file.xz", "xz") == "xz" 73 | assert get_compression("some_file.xz", "lzma") == "lzma" 74 | 75 | 76 | def test_list(): 77 | here = os.path.abspath(os.path.dirname(__file__)) 78 | flist = os.listdir(here) 79 | plist = [os.path.join(here, p).replace("\\", "/") for p in flist] 80 | of = open_files(plist) 81 | assert len(of) == len(flist) 82 | assert [f.path for f in of] == plist 83 | 84 | 85 | def test_pathobject(tmpdir): 86 | import pathlib 87 | 88 | tmpdir = str(tmpdir) 89 | plist_str = [os.path.join(str(tmpdir), f).replace("\\", "/") for f in ["a", "b"]] 90 | open(plist_str[0], "w").write("first file") 91 | open(plist_str[1], "w").write("second file") 92 | plist = [pathlib.Path(p) for p in plist_str] 93 | of = open_files(plist) 94 | assert len(of) == 2 95 | assert [f.path for f in of] == plist_str 96 | 97 | of = open_files(plist[0]) 98 | assert len(of) == 1 99 | assert of[0].path == plist_str[0] 100 | with of[0] as f: 101 | assert f.read() == open(plist_str[0], "rb").read() 102 | 103 | 104 | def test_automkdir(tmpdir): 105 | dir = os.path.join(str(tmpdir), "a") 106 | of = fsspec.open(os.path.join(dir, "afile"), "w") 107 | with 
of: 108 | pass 109 | assert "afile" in os.listdir(dir) 110 | 111 | dir = os.path.join(str(tmpdir), "b") 112 | of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=True) 113 | with of: 114 | pass 115 | 116 | assert "bfile" in os.listdir(dir) 117 | 118 | dir = os.path.join(str(tmpdir), "c") 119 | with pytest.raises(FileNotFoundError): 120 | of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=False) 121 | with of: 122 | pass 123 | 124 | 125 | def test_automkdir_readonly(tmpdir): 126 | dir = os.path.join(str(tmpdir), "d") 127 | with pytest.raises(FileNotFoundError): 128 | of = fsspec.open(os.path.join(dir, "dfile"), "r") 129 | with of: 130 | pass 131 | 132 | 133 | def test_openfile_pickle_newline(): 134 | # GH#318 135 | test = fsspec.open(__file__, newline=b"") 136 | 137 | pickled = pickle.dumps(test) 138 | restored = pickle.loads(pickled) 139 | 140 | assert test.newline == restored.newline 141 | 142 | 143 | def test_mismatch(): 144 | with pytest.raises(ValueError, match="protocol"): 145 | open_files(["s3://test/path.csv", "/other/path.csv"]) 146 | 147 | 148 | def test_url_kwargs_chain(ftp_writable): 149 | host, port, username, password = "localhost", 2121, "user", "pass" 150 | data = b"hello" 151 | with fsspec.open( 152 | "ftp:///afile", "wb", host=host, port=port, username=username, password=password 153 | ) as f: 154 | f.write(data) 155 | 156 | with fsspec.open( 157 | "simplecache::ftp://{}:{}@{}:{}/afile".format(username, password, host, port), 158 | "rb", 159 | ) as f: 160 | assert f.read() == data 161 | 162 | 163 | def test_multi_context(tmpdir): 164 | fns = [os.path.join(tmpdir, fn) for fn in ["a", "b"]] 165 | files = open_files(fns, "wb") 166 | assert isinstance(files, OpenFiles) 167 | assert isinstance(files[0], OpenFile) 168 | assert len(files) == 2 169 | with files as of: 170 | assert len(of) == 2 171 | assert not of[0].closed 172 | assert of[0].name.endswith("a") 173 | assert of[0].closed 174 | assert repr(files) == "" 175 | 176 | 177 | def test_not_local(): 178 | with pytest.raises(ValueError, match="attribute local_file=True"): 179 | open_local("memory://afile") 180 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_zip.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | from contextlib import contextmanager 3 | 4 | import os 5 | import pickle 6 | import pytest 7 | import sys 8 | import tempfile 9 | import fsspec 10 | 11 | 12 | @contextmanager 13 | def tempzip(data={}): 14 | f = tempfile.mkstemp(suffix="zip")[1] 15 | with zipfile.ZipFile(f, mode="w") as z: 16 | for k, v in data.items(): 17 | z.writestr(k, v) 18 | try: 19 | yield f 20 | finally: 21 | try: 22 | os.remove(f) 23 | except (IOError, OSError): 24 | pass 25 | 26 | 27 | data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"} 28 | 29 | 30 | def test_empty(): 31 | with tempzip() as z: 32 | fs = fsspec.filesystem("zip", fo=z) 33 | assert fs.find("") == [] 34 | assert fs.find("", withdirs=True) == [] 35 | with pytest.raises(FileNotFoundError): 36 | fs.info("") 37 | assert fs.ls("") == [] 38 | 39 | 40 | def test_glob(): 41 | with tempzip(data) as z: 42 | fs = fsspec.filesystem("zip", fo=z) 43 | assert fs.glob("*/*/*th") == ["deeply/nested/path"] 44 | 45 | 46 | @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip-info odd on py35") 47 | def test_mapping(): 48 | with tempzip(data) as z: 49 | fs = fsspec.filesystem("zip", fo=z) 50 | m = fs.get_mapper("") 51 | assert list(m) == 
["a", "b", "deeply/nested/path"] 52 | assert m["b"] == data["b"] 53 | 54 | 55 | @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip not supported on py35") 56 | def test_pickle(): 57 | with tempzip(data) as z: 58 | fs = fsspec.filesystem("zip", fo=z) 59 | fs2 = pickle.loads(pickle.dumps(fs)) 60 | assert fs2.cat("b") == b"hello" 61 | 62 | 63 | def test_all_dirnames(): 64 | with tempzip() as z: 65 | fs = fsspec.filesystem("zip", fo=z) 66 | 67 | # fx are files, dx are a directories 68 | assert fs._all_dirnames([]) == set() 69 | assert fs._all_dirnames(["f1"]) == set() 70 | assert fs._all_dirnames(["f1", "f2"]) == set() 71 | assert fs._all_dirnames(["f1", "f2", "d1/f1"]) == {"d1"} 72 | assert fs._all_dirnames(["f1", "d1/f1", "d1/f2"]) == {"d1"} 73 | assert fs._all_dirnames(["f1", "d1/f1", "d2/f1"]) == {"d1", "d2"} 74 | assert fs._all_dirnames(["d1/d1/d1/f1"]) == {"d1", "d1/d1", "d1/d1/d1"} 75 | 76 | 77 | def test_ls(): 78 | with tempzip(data) as z: 79 | lhs = fsspec.filesystem("zip", fo=z) 80 | 81 | assert lhs.ls("") == ["a", "b", "deeply/"] 82 | assert lhs.ls("/") == lhs.ls("") 83 | 84 | assert lhs.ls("deeply") == ["deeply/nested/"] 85 | assert lhs.ls("deeply/") == lhs.ls("deeply") 86 | 87 | assert lhs.ls("deeply/nested") == ["deeply/nested/path"] 88 | assert lhs.ls("deeply/nested/") == lhs.ls("deeply/nested") 89 | 90 | 91 | def test_find(): 92 | with tempzip(data) as z: 93 | lhs = fsspec.filesystem("zip", fo=z) 94 | 95 | assert lhs.find("") == ["a", "b", "deeply/nested/path"] 96 | assert lhs.find("", withdirs=True) == [ 97 | "a", 98 | "b", 99 | "deeply/", 100 | "deeply/nested/", 101 | "deeply/nested/path", 102 | ] 103 | 104 | assert lhs.find("deeply") == ["deeply/nested/path"] 105 | assert lhs.find("deeply/") == lhs.find("deeply") 106 | 107 | 108 | def test_walk(): 109 | with tempzip(data) as z: 110 | fs = fsspec.filesystem("zip", fo=z) 111 | expected = [ 112 | # (dirname, list of subdirs, list of files) 113 | ("", ["deeply"], ["a", "b"]), 114 | ("deeply", ["nested"], []), 115 | ("deeply/nested", [], ["path"]), 116 | ] 117 | assert list(fs.walk("")) == expected 118 | 119 | 120 | def test_info(): 121 | with tempzip(data) as z: 122 | fs_cache = fsspec.filesystem("zip", fo=z) 123 | 124 | with pytest.raises(FileNotFoundError): 125 | fs_cache.info("i-do-not-exist") 126 | 127 | # Iterate over all directories 128 | # The ZipFile does not include additional information about the directories, 129 | for d in fs_cache._all_dirnames(data.keys()): 130 | lhs = fs_cache.info(d) 131 | expected = {"name": f"{d}/", "size": 0, "type": "directory"} 132 | assert lhs == expected 133 | 134 | # Iterate over all files 135 | for f, v in data.items(): 136 | lhs = fs_cache.info(f) 137 | assert lhs["name"] == f 138 | assert lhs["size"] == len(v) 139 | assert lhs["type"] == "file" 140 | 141 | # There are many flags specific to Zip Files. 
142 | # These are two we can use to check we are getting some of them 143 | assert "CRC" in lhs 144 | assert "compress_size" in lhs 145 | 146 | 147 | @pytest.mark.parametrize("scale", [128, 512, 4096]) 148 | def test_isdir_isfile(scale): 149 | def make_nested_dir(i): 150 | x = f"{i}" 151 | table = x.maketrans("0123456789", "ABCDEFGHIJ") 152 | return "/".join(x.translate(table)) 153 | 154 | scaled_data = {f"{make_nested_dir(i)}/{i}": b"" for i in range(1, scale + 1)} 155 | with tempzip(scaled_data) as z: 156 | fs = fsspec.filesystem("zip", fo=z) 157 | 158 | lhs_dirs, lhs_files = fs._all_dirnames(scaled_data.keys()), scaled_data.keys() 159 | 160 | # Warm-up the Cache, this is done in both cases anyways... 161 | fs._get_dirs() 162 | 163 | entries = lhs_files | lhs_dirs 164 | 165 | assert lhs_dirs == {e for e in entries if fs.isdir(e)} 166 | assert lhs_files == {e for e in entries if fs.isfile(e)} 167 | -------------------------------------------------------------------------------- /fsspec/implementations/reference.py: -------------------------------------------------------------------------------- 1 | import json 2 | from ..asyn import AsyncFileSystem 3 | from ..core import open, filesystem 4 | 5 | 6 | class ReferenceFileSystem(AsyncFileSystem): 7 | """View byte ranges of some other file as a file system 8 | 9 | Initial version: single file system target, which must support 10 | async, and must allow start and end args in _cat_file. Later versions 11 | may allow multiple arbitrary URLs for the targets. 12 | 13 | This FileSystem is read-only. It is designed to be used with async 14 | targets (for now). This FileSystem only allows whole-file access, no 15 | ``open``. We do not get original file details from the target FS. 16 | 17 | Configuration is by passing a dict of references at init, or a URL to 18 | a JSON file containing the same; this dict 19 | can also contain concrete data for some set of paths. 20 | 21 | Reference dict format: 22 | {path0: bytes_data, path1: (target_url, offset, size)} 23 | 24 | https://github.com/intake/fsspec-reference-maker/blob/main/README.md 25 | """ 26 | 27 | protocol = "reference" 28 | 29 | def __init__( 30 | self, 31 | references, 32 | target=None, 33 | ref_storage_args=None, 34 | target_protocol=None, 35 | target_options=None, 36 | fs=None, 37 | **kwargs 38 | ): 39 | """ 40 | 41 | Parameters 42 | ---------- 43 | references : dict or str 44 | The set of references to use for this instance, with a structure as above. 45 | If str, will use fsspec.open, in conjunction with ref_storage_args to 46 | open and parse JSON at this location. 47 | target : str 48 | For any references having target_url as None, this is the default file 49 | target to use 50 | ref_storage_args : dict 51 | If references is a str, use these kwargs for loading the JSON file 52 | target_protocol : str 53 | If fs is None, instantiate a file system using this protocol 54 | target_options : dict 55 | If fs is None, instantiate a filesystem using these kwargs 56 | fs : file system instance 57 | Directly provide a file system, if you want to configure it beforehand. 
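To illustrate the constructor being documented here, a sketch with an invented target URL; inline bytes are returned directly, while tuple entries are fetched as byte ranges from the (async) target:

    from fsspec.implementations.reference import ReferenceFileSystem

    refs = {
        "inline.bin": b"stored directly in the references",
        "chunk.bin": ("https://example.com/data.bin", 0, 1024),  # (url, offset, size)
    }
    fs = ReferenceFileSystem(refs, target_protocol="http")
    assert fs.cat("inline.bin") == b"stored directly in the references"  # no network access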
This 58 | takes precedence over target_protocol/target_options 59 | kwargs : passed to parent class 60 | """ 61 | if fs is not None: 62 | if not fs.async_impl: 63 | raise NotImplementedError("Only works with async targets") 64 | kwargs["loop"] = fs.loop 65 | super().__init__(**kwargs) 66 | if fs is None: 67 | fs = filesystem(target_protocol, loop=self.loop, **(target_options or {})) 68 | if not fs.async_impl: 69 | raise NotImplementedError("Only works with async targets") 70 | if isinstance(references, str): 71 | with open(references, "rb", **(ref_storage_args or {})) as f: 72 | references = json.load(f) 73 | self.references = references 74 | self.target = target 75 | self._process_references() 76 | self.fs = fs 77 | 78 | async def _cat_file(self, path): 79 | path = self._strip_protocol(path) 80 | part = self.references[path] 81 | if isinstance(part, bytes): 82 | return part 83 | elif isinstance(part, str): 84 | return part.encode() 85 | url, start, size = part 86 | end = start + size 87 | if url is None: 88 | url = self.target 89 | return await self.fs._cat_file(url, start=start, end=end) 90 | 91 | def _process_references(self): 92 | if "zarr_consolidated_format" in self.references: 93 | self.references = _unmodel_hdf5(self.references) 94 | self.dircache = {"": []} 95 | for path, part in self.references.items(): 96 | if isinstance(part, (bytes, str)): 97 | size = len(part) 98 | else: 99 | _, start, size = part 100 | # tuples follow the documented (url, offset, size) format 101 | par = self._parent(path) 102 | par0 = par 103 | while par0: 104 | # build parent directories 105 | if par0 not in self.dircache: 106 | self.dircache[par0] = [] 107 | self.dircache.setdefault(self._parent(par0), []).append( 108 | {"name": par0, "type": "directory", "size": 0} 109 | ) 110 | par0 = self._parent(par0) 111 | 112 | self.dircache[par].append({"name": path, "type": "file", "size": size}) 113 | 114 | def ls(self, path, detail=True, **kwargs): 115 | path = self._strip_protocol(path) 116 | out = self._ls_from_cache(path) 117 | if detail: 118 | return out 119 | return [o["name"] for o in out] 120 | 121 | 122 | def _unmodel_hdf5(references): 123 | """Special JSON format from HDF5""" 124 | # see https://gist.github.com/ajelenak/80354a95b449cedea5cca508004f97a9 125 | import re 126 | 127 | ref = {} 128 | for key, value in references["metadata"].items(): 129 | if key.endswith(".zchunkstore"): 130 | source = value.pop("source")["uri"] 131 | match = re.findall(r"https://([^.]+)\.s3\.amazonaws\.com", source) 132 | if match: 133 | source = source.replace( 134 | f"https://{match[0]}.s3.amazonaws.com", match[0] 135 | ) 136 | for k, v in value.items(): 137 | ref[k] = (source, v["offset"], v["size"]) 138 | else: 139 | ref[key] = json.dumps(value).encode() 140 | return ref 141 | -------------------------------------------------------------------------------- /docs/source/changelog.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Dev 5 | ------------- 6 | 7 | Features: 8 | 9 | - Add dbfs:// support 10 | 11 | Fixes: 12 | 13 | - random appending of a directory within the filesystem's ``find()`` method 14 | 15 | Version 0.8.5 16 | ------------- 17 | 18 | Features: 19 | 20 | - config system 21 | - libarchive implementation 22 | - add reference file system implementation 23 | 24 | Version 0.8.4 25 | ------------- 26 | 27 | Features: 28 | 29 | - function ``can_be_local`` to see whether URL is compatible with ``open_local`` 30 | - concurrent cat with filecaches, if backend supports it
31 | - jupyter FS 32 | 33 | Fixes: 34 | 35 | - dircache expiry after transaction 36 | - blockcache garbage collection 37 | - close for HDFS 38 | - windows tests 39 | - glob depth with "**" 40 | 41 | Version 0.8.3 42 | ------------- 43 | 44 | Features: 45 | 46 | - error options for cat 47 | - memory fs created time in detailed `ls` 48 | 49 | 50 | Fixes: 51 | 52 | - duplicate directories could appear in MemoryFileSystem 53 | - Added support for hat dollar lbrace rbrace regex character escapes in glob 54 | - Fix blockcache (was doing unnecessary work) 55 | - handle multibyte dtypes in readinto 56 | - Fix missing kwargs in call to _copy in asyn 57 | 58 | Other: 59 | 60 | - Stop inheriting from pyarrow.filesystem for pyarrow>=2.0 61 | - Raise low-level program friendly OSError. 62 | - Guard against instance reuse in new processes 63 | - Make hash_name a method on CachingFileSystem to make it easier to change. 64 | - Use get_event_loop for py3.6 compatibility 65 | 66 | Version 0.8.2 67 | ------------- 68 | 69 | Fixes: 70 | 71 | - More careful strip for caching 72 | 73 | Version 0.8.1 74 | ------------- 75 | 76 | Features: 77 | 78 | - add sign to base class 79 | - Allow calling of coroutines from normal code when running async 80 | - Implement writing for cached many files 81 | - Allow concurrent caching of remote files 82 | - Add gdrive:// protocol 83 | 84 | Fixes: 85 | 86 | - Fix memfs with exact ls 87 | - HTTPFileSystem requires requests and aiohttp in registry 88 | 89 | Other: 90 | 91 | - Allow http kwargs to clientSession 92 | - Use extras_require in setup.py for optional dependencies 93 | - Replacing md5 with sha256 for hash (CVE req) 94 | - Test against Python 3.8, drop 3.5 testing 95 | - add az alias for abfs 96 | 97 | Version 0.8.0 98 | ------------- 99 | 100 | Major release allowing async implementations with concurrent batch 101 | operations. 
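For example, with the async HTTP implementation a single ``cat`` call over a list of URLs gathers the requests concurrently rather than serially (URLs illustrative):

    import fsspec

    fs = fsspec.filesystem("http")
    pages = fs.cat(["https://example.com/a", "https://example.com/b"])
    # -> dict mapping each URL to its bytes, fetched in one event-loop pass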
102 | 103 | Features: 104 | 105 | - async filesystem spec, first applied to HTTP 106 | - OpenFiles context for multiple files 107 | - Document async, and ensure docstrings 108 | - Make LocalFileOpener iterable 109 | - handle smb:// protocol using smbprotocol package 110 | - allow Path object in open 111 | - simplecache write mode 112 | 113 | Fixes: 114 | 115 | - test_local: fix username not in home path 116 | - Tighten cacheFS if dir deleted 117 | - Fix race condition of lzma import when using threads 118 | - properly rewind MemoryFile 119 | - OpenFile newline in reduce 120 | 121 | Other: 122 | 123 | - Add aiobotocore to deps for s3fs check 124 | - Set default clobber=True on impl register 125 | - Use _get_kwargs_from_url when unchaining 126 | - Add cache_type and cache_options to HTTPFileSystem constructor 127 | 128 | Version 0.7.5 129 | ------------- 130 | 131 | * async implemented for HTTP as prototype (read-only) 132 | * write for simplecache 133 | * added SMB (Samba, protocol >=2) implementation 134 | 135 | Version 0.7.4 136 | ------------- 137 | 138 | * panel-based GUI 139 | 140 | 0.7.3 series 141 | ------------ 142 | 143 | * added ``git`` and ``github`` interfaces 144 | * added chained syntax for open, open_files and get_mapper 145 | * adapt webHDFS for HttpFS 146 | * added open_local 147 | * added ``simplecache``, and compression to both file caches 148 | 149 | 150 | Version 0.6.2 151 | ------------- 152 | 153 | * Added ``adl`` and ``abfs`` protocols to the known implementations registry (:pr:`209`) 154 | * Fixed issue with whole-file caching and implementations providing multiple protocols (:pr:`219`) 155 | 156 | Version 0.6.1 157 | ------------- 158 | 159 | * ``LocalFileSystem`` is now considered a filestore by pyarrow (:pr:`211`) 160 | * Fixed bug in HDFS filesystem with ``cache_options`` (:pr:`202`) 161 | * Fixed instance caching bug with multiple instances (:pr:`203`) 162 | 163 | 164 | Version 0.6.0 165 | ------------- 166 | 167 | * Fixed issues with filesystem instance caching. This was causing authorization errors 168 | in downstream libraries like ``gcsfs`` and ``s3fs`` in multi-threaded code (:pr:`155`, :pr:`181`) 169 | * Changed the default file caching strategy to :class:`fsspec.caching.ReadAheadCache` (:pr:`193`) 170 | * Moved file caches to the new ``fsspec.caching`` module. They're still available from 171 | their old location in ``fsspec.core``, but we recommend using the new location for new code (:pr:`195`) 172 | * Added a new file caching strategy, :class:`fsspec.caching.BlockCache` for fetching and caching 173 | file reads in blocks (:pr:`191`). 174 | * Fixed equality checks for file system instances to return ``False`` when compared to objects 175 | other than file systems (:pr:`192`) 176 | * Fixed a bug in :meth:`fsspec.FSMap.keys` returning a generator, which was consumed upon iteration (:pr:`189`). 177 | * Removed the magic addition of aliases in ``AbstractFileSystem.__init__``. Now alias methods are always 178 | present (:pr:`177`) 179 | * Deprecated passing ``trim`` to :class:`fsspec.spec.AbstractBufferedFile`.
Pass it in ``storage_options`` instead (:pr:`188`) 180 | * Improved handling of requests for :class:`fsspec.implementations.http.HTTPFileSystem` when the 181 | HTTP server responds with an (incorrect) content-length of 0 (:pr:`163`) 182 | * Added a ``detail=True`` parameter to :meth:`fsspec.spec.AbstractFileSystem.ls` (:pr:`168`) 183 | * Fixed handling of UNC/DFS paths (:issue:`154`) 184 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_libarchive.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | import os 4 | import pickle 5 | import pytest 6 | import tempfile 7 | import fsspec 8 | 9 | libarchive = pytest.importorskip("libarchive") 10 | 11 | 12 | @contextmanager 13 | def temparchive(data={}): 14 | f = tempfile.mkstemp(suffix="7z")[1] 15 | with libarchive.file_writer(f, "7zip") as archive: 16 | for k, v in data.items(): 17 | archive.add_file_from_memory(entry_path=k, entry_size=len(v), entry_data=v) 18 | try: 19 | yield f 20 | finally: 21 | try: 22 | os.remove(f) 23 | except (IOError, OSError): 24 | pass 25 | 26 | 27 | data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"} 28 | 29 | 30 | def test_empty(): 31 | with temparchive() as archive_file: 32 | fs = fsspec.filesystem("libarchive", fo=archive_file) 33 | assert fs.find("") == [] 34 | assert fs.find("", withdirs=True) == [] 35 | with pytest.raises(FileNotFoundError): 36 | fs.info("") 37 | assert fs.ls("") == [] 38 | 39 | 40 | def test_mapping(): 41 | with temparchive(data) as archive_file: 42 | fs = fsspec.filesystem("libarchive", fo=archive_file) 43 | m = fs.get_mapper("") 44 | 45 | fs._get_dirs() 46 | print(fs.dir_cache) 47 | 48 | assert list(m) == ["a", "b", "deeply/nested/path"] 49 | assert m["b"] == data["b"] 50 | 51 | 52 | def test_pickle(): 53 | with temparchive(data) as archive_file: 54 | fs = fsspec.filesystem("libarchive", fo=archive_file) 55 | fs2 = pickle.loads(pickle.dumps(fs)) 56 | assert fs2 is fs 57 | assert fs2.cat("b") == b"hello" 58 | 59 | 60 | def test_all_dirnames(): 61 | with temparchive() as archive_file: 62 | fs = fsspec.filesystem("libarchive", fo=archive_file) 63 | 64 | # fx are files, dx are a directories 65 | assert fs._all_dirnames([]) == set() 66 | assert fs._all_dirnames(["f1"]) == set() 67 | assert fs._all_dirnames(["f1", "f2"]) == set() 68 | assert fs._all_dirnames(["f1", "f2", "d1/f1"]) == {"d1"} 69 | assert fs._all_dirnames(["f1", "d1/f1", "d1/f2"]) == {"d1"} 70 | assert fs._all_dirnames(["f1", "d1/f1", "d2/f1"]) == {"d1", "d2"} 71 | assert fs._all_dirnames(["d1/d1/d1/f1"]) == {"d1", "d1/d1", "d1/d1/d1"} 72 | 73 | 74 | def test_ls(): 75 | with temparchive(data) as archive_file: 76 | lhs = fsspec.filesystem("libarchive", fo=archive_file) 77 | 78 | assert lhs.ls("") == ["a", "b", "deeply/"] 79 | assert lhs.ls("/") == lhs.ls("") 80 | 81 | assert lhs.ls("deeply") == ["deeply/nested/"] 82 | assert lhs.ls("deeply/") == lhs.ls("deeply") 83 | 84 | assert lhs.ls("deeply/nested") == ["deeply/nested/path"] 85 | assert lhs.ls("deeply/nested/") == lhs.ls("deeply/nested") 86 | 87 | 88 | def test_find(): 89 | with temparchive(data) as archive_file: 90 | lhs = fsspec.filesystem("libarchive", fo=archive_file) 91 | 92 | assert lhs.find("") == ["a", "b", "deeply/nested/path"] 93 | assert lhs.find("", withdirs=True) == [ 94 | "a", 95 | "b", 96 | "deeply/", 97 | "deeply/nested/", 98 | "deeply/nested/path", 99 | ] 100 | 101 | assert lhs.find("deeply") == 
["deeply/nested/path"] 102 | assert lhs.find("deeply/") == lhs.find("deeply") 103 | 104 | 105 | def test_walk(): 106 | with temparchive(data) as archive_file: 107 | fs = fsspec.filesystem("libarchive", fo=archive_file) 108 | expected = [ 109 | # (dirname, list of subdirs, list of files) 110 | ("", ["deeply"], ["a", "b"]), 111 | ("deeply", ["nested"], []), 112 | ("deeply/nested", [], ["path"]), 113 | ] 114 | for lhs, rhs in zip(fs.walk(""), expected): 115 | assert lhs[0] == rhs[0] 116 | assert sorted(lhs[1]) == sorted(rhs[1]) 117 | assert sorted(lhs[2]) == sorted(rhs[2]) 118 | 119 | 120 | def test_info(): 121 | with temparchive(data) as archive_file: 122 | fs_cache = fsspec.filesystem("libarchive", fo=archive_file) 123 | 124 | with pytest.raises(FileNotFoundError): 125 | fs_cache.info("i-do-not-exist") 126 | 127 | # Iterate over all directories 128 | # The 7zip archive does not include additional information about the 129 | # directories 130 | for d in fs_cache._all_dirnames(data.keys()): 131 | lhs = fs_cache.info(d) 132 | expected = {"name": f"{d}/", "size": 0, "type": "directory"} 133 | assert lhs == expected 134 | 135 | # Iterate over all files 136 | for f, v in data.items(): 137 | lhs = fs_cache.info(f) 138 | assert lhs["name"] == f 139 | assert lhs["size"] == len(v) 140 | assert lhs["type"] == "file" 141 | 142 | # These are the specific flags retrieved from the archived files 143 | assert "created" in lhs 144 | assert "mode" in lhs 145 | assert "uid" in lhs 146 | assert "gid" in lhs 147 | assert "mtime" in lhs 148 | 149 | 150 | @pytest.mark.parametrize("scale", [128, 512, 4096]) 151 | def test_isdir_isfile(scale): 152 | def make_nested_dir(i): 153 | x = f"{i}" 154 | table = x.maketrans("0123456789", "ABCDEFGHIJ") 155 | return "/".join(x.translate(table)) 156 | 157 | scaled_data = {f"{make_nested_dir(i)}/{i}": b"" for i in range(1, scale + 1)} 158 | with temparchive(scaled_data) as archive_file: 159 | fs = fsspec.filesystem("libarchive", fo=archive_file) 160 | 161 | lhs_dirs, lhs_files = fs._all_dirnames(scaled_data.keys()), scaled_data.keys() 162 | 163 | # Warm-up the Cache, this is done in both cases anyways... 164 | fs._get_dirs() 165 | 166 | entries = lhs_files | lhs_dirs 167 | 168 | assert lhs_dirs == {e for e in entries if fs.isdir(e)} 169 | assert lhs_files == {e for e in entries if fs.isfile(e)} 170 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # fsspec documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jan 15 18:11:02 2018. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | 22 | sys.path.insert(0, os.path.abspath("../..")) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 
28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.viewcode", 37 | "sphinx.ext.autosummary", 38 | "sphinx.ext.extlinks", 39 | "numpydoc", 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ["_templates"] 44 | 45 | # The suffix(es) of source filenames. 46 | # You can specify multiple suffix as a list of string: 47 | # 48 | # source_suffix = ['.rst', '.md'] 49 | source_suffix = ".rst" 50 | 51 | # The master toctree document. 52 | master_doc = "index" 53 | 54 | # General information about the project. 55 | project = "fsspec" 56 | copyright = "2018, Martin Durant" 57 | author = "Martin Durant" 58 | 59 | # The version info for the project you're documenting, acts as replacement for 60 | # |version| and |release|, also used in various other places throughout the 61 | # built documents. 62 | # 63 | # The short X.Y version. 64 | import fsspec 65 | 66 | version = fsspec.__version__ 67 | # The full version, including alpha/beta/rc tags. 68 | release = fsspec.__version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | # 73 | # This is also used if you do content translation via gettext catalogs. 74 | # Usually you set "language" from the command line for these cases. 75 | language = None 76 | 77 | # List of patterns, relative to source directory, that match files and 78 | # directories to ignore when looking for source files. 79 | # This patterns also effect to html_static_path and html_extra_path 80 | exclude_patterns = [] 81 | 82 | # The name of the Pygments (syntax highlighting) style to use. 83 | pygments_style = "sphinx" 84 | 85 | # If true, `todo` and `todoList` produce output, else they produce nothing. 86 | todo_include_todos = False 87 | 88 | 89 | # -- Options for HTML output ---------------------------------------------- 90 | 91 | # The theme to use for HTML and HTML Help pages. See the documentation for 92 | # a list of builtin themes. 93 | # 94 | html_theme = "sphinx_rtd_theme" 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # 100 | # html_theme_options = {} 101 | 102 | # Add any paths that contain custom static files (such as style sheets) here, 103 | # relative to this directory. They are copied after the builtin static files, 104 | # so a file named "default.css" will overwrite the builtin "default.css". 105 | html_static_path = [] 106 | 107 | # Custom sidebar templates, must be a dictionary that maps document names 108 | # to template names. 109 | # 110 | # This is required for the alabaster theme 111 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 112 | html_sidebars = { 113 | "**": [ 114 | "relations.html", # needs 'show_related': True theme option to display 115 | "searchbox.html", 116 | ] 117 | } 118 | 119 | 120 | # -- Options for HTMLHelp output ------------------------------------------ 121 | 122 | # Output file base name for HTML help builder. 123 | htmlhelp_basename = "fsspecdoc" 124 | 125 | 126 | # -- Options for LaTeX output --------------------------------------------- 127 | 128 | latex_elements = { 129 | # The paper size ('letterpaper' or 'a4paper'). 
130 | # 131 | # 'papersize': 'letterpaper', 132 | # The font size ('10pt', '11pt' or '12pt'). 133 | # 134 | # 'pointsize': '10pt', 135 | # Additional stuff for the LaTeX preamble. 136 | # 137 | # 'preamble': '', 138 | # Latex figure (float) alignment 139 | # 140 | # 'figure_align': 'htbp', 141 | } 142 | 143 | # Grouping the document tree into LaTeX files. List of tuples 144 | # (source start file, target name, title, 145 | # author, documentclass [howto, manual, or own class]). 146 | latex_documents = [ 147 | (master_doc, "fsspec.tex", "fsspec Documentation", "Joseph Crail", "manual") 148 | ] 149 | 150 | 151 | # -- Options for manual page output --------------------------------------- 152 | 153 | # One entry per manual page. List of tuples 154 | # (source start file, name, description, authors, manual section). 155 | man_pages = [(master_doc, "fsspec", "fsspec Documentation", [author], 1)] 156 | 157 | 158 | # -- Options for Texinfo output ------------------------------------------- 159 | 160 | # Grouping the document tree into Texinfo files. List of tuples 161 | # (source start file, target name, title, author, 162 | # dir menu entry, description, category) 163 | texinfo_documents = [ 164 | ( 165 | master_doc, 166 | "fsspec", 167 | "fsspec Documentation", 168 | author, 169 | "fsspec", 170 | "One line description of project.", 171 | "Miscellaneous", 172 | ) 173 | ] 174 | 175 | extlinks = { 176 | "issue": ("https://github.com/intake/filesystem_spec/issues/%s", "GH#"), 177 | "pr": ("https://github.com/intake/filesystem_spec/pull/%s", "GH#"), 178 | } 179 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. currentmodule:: fsspec 5 | 6 | User Functions 7 | -------------- 8 | 9 | .. autosummary:: 10 | fsspec.open_files 11 | fsspec.open 12 | fsspec.open_local 13 | fsspec.filesystem 14 | fsspec.get_filesystem_class 15 | fsspec.get_mapper 16 | fsspec.fuse.run 17 | fsspec.gui.FileSelector 18 | 19 | .. autofunction:: fsspec.open_files 20 | .. autofunction:: fsspec.open 21 | .. autofunction:: fsspec.open_local 22 | .. autofunction:: fsspec.filesystem 23 | .. autofunction:: fsspec.get_filesystem_class 24 | .. autofunction:: fsspec.get_mapper 25 | .. autofunction:: fsspec.fuse.run 26 | .. autoclass:: fsspec.gui.FileSelector 27 | :members: 28 | 29 | Base Classes 30 | ------------ 31 | 32 | .. autosummary:: 33 | fsspec.spec.AbstractFileSystem 34 | fsspec.spec.Transaction 35 | fsspec.spec.AbstractBufferedFile 36 | fsspec.FSMap 37 | fsspec.asyn.AsyncFileSystem 38 | fsspec.core.OpenFile 39 | fsspec.core.OpenFiles 40 | fsspec.core.BaseCache 41 | fsspec.core.get_fs_token_paths 42 | fsspec.dircache.DirCache 43 | fsspec.registry.ReadOnlyRegistry 44 | fsspec.registry.register_implementation 45 | 46 | .. autoclass:: fsspec.spec.AbstractFileSystem 47 | :members: 48 | 49 | .. autoclass:: fsspec.spec.Transaction 50 | :members: 51 | 52 | .. autoclass:: fsspec.spec.AbstractBufferedFile 53 | :members: 54 | 55 | .. autoclass:: fsspec.asyn.AsyncFileSystem 56 | 57 | .. autoclass:: fsspec.FSMap 58 | :members: 59 | 60 | .. autoclass:: fsspec.core.OpenFile 61 | :members: 62 | 63 | .. autoclass:: fsspec.core.OpenFiles 64 | 65 | .. autoclass:: fsspec.core.BaseCache 66 | :members: 67 | 68 | .. autofunction:: fsspec.core.get_fs_token_paths 69 | 70 | .. autoclass:: fsspec.dircache.DirCache 71 | :members: __init__ 72 | 73 | .. 
autoclass:: fsspec.registry.ReadOnlyRegistry 74 | :members: __init__ 75 | 76 | .. autofunction:: fsspec.registry.register_implementation 77 | 78 | .. _implementations: 79 | 80 | Built-in Implementations 81 | ------------------------ 82 | 83 | .. autosummary:: 84 | fsspec.implementations.ftp.FTPFileSystem 85 | fsspec.implementations.hdfs.PyArrowHDFS 86 | fsspec.implementations.dask.DaskWorkerFileSystem 87 | fsspec.implementations.http.HTTPFileSystem 88 | fsspec.implementations.local.LocalFileSystem 89 | fsspec.implementations.memory.MemoryFileSystem 90 | fsspec.implementations.github.GithubFileSystem 91 | fsspec.implementations.sftp.SFTPFileSystem 92 | fsspec.implementations.webhdfs.WebHDFS 93 | fsspec.implementations.zip.ZipFileSystem 94 | fsspec.implementations.cached.CachingFileSystem 95 | fsspec.implementations.cached.WholeFileCacheFileSystem 96 | fsspec.implementations.cached.SimpleCacheFileSystem 97 | fsspec.implementations.git.GitFileSystem 98 | fsspec.implementations.smb.SMBFileSystem 99 | fsspec.implementations.jupyter.JupyterFileSystem 100 | fsspec.implementations.libarchive.LibArchiveFileSystem 101 | fsspec.implementations.dbfs.DatabricksFileSystem 102 | fsspec.implementations.reference.ReferenceFileSystem 103 | 104 | .. autoclass:: fsspec.implementations.ftp.FTPFileSystem 105 | :members: __init__ 106 | 107 | .. autoclass:: fsspec.implementations.hdfs.PyArrowHDFS 108 | :members: __init__ 109 | 110 | .. autoclass:: fsspec.implementations.dask.DaskWorkerFileSystem 111 | :members: __init__ 112 | 113 | .. autoclass:: fsspec.implementations.http.HTTPFileSystem 114 | :members: __init__ 115 | 116 | .. autoclass:: fsspec.implementations.local.LocalFileSystem 117 | :members: __init__ 118 | 119 | .. autoclass:: fsspec.implementations.memory.MemoryFileSystem 120 | :members: __init__ 121 | 122 | .. autoclass:: fsspec.implementations.sftp.SFTPFileSystem 123 | :members: __init__ 124 | 125 | .. autoclass:: fsspec.implementations.webhdfs.WebHDFS 126 | :members: __init__ 127 | 128 | .. autoclass:: fsspec.implementations.zip.ZipFileSystem 129 | :members: __init__ 130 | 131 | .. autoclass:: fsspec.implementations.cached.CachingFileSystem 132 | :members: __init__ 133 | 134 | .. autoclass:: fsspec.implementations.cached.WholeFileCacheFileSystem 135 | :members: __init__ 136 | 137 | .. autoclass:: fsspec.implementations.cached.SimpleCacheFileSystem 138 | :members: __init__ 139 | 140 | .. autoclass:: fsspec.implementations.github.GithubFileSystem 141 | :members: __init__ 142 | 143 | .. autoclass:: fsspec.implementations.git.GitFileSystem 144 | :members: __init__ 145 | 146 | .. autoclass:: fsspec.implementations.smb.SMBFileSystem 147 | :members: __init__ 148 | 149 | .. autoclass:: fsspec.implementations.jupyter.JupyterFileSystem 150 | :members: __init__ 151 | 152 | .. autoclass:: fsspec.implementations.libarchive.LibArchiveFileSystem 153 | :members: __init__ 154 | 155 | .. autoclass:: fsspec.implementations.dbfs.DatabricksFileSystem 156 | :members: __init__ 157 | 158 | .. autoclass:: fsspec.implementations.reference.ReferenceFileSystem 159 | :members: __init__ 160 | 161 | Other Known Implementations 162 | --------------------------- 163 | 164 | - `s3fs`_ for Amazon S3 and other compatible stores 165 | - `gcsfs`_ for Google Cloud Storage 166 | - `adl`_ for Azure DataLake storage 167 | - `abfs`_ for Azure Blob service 168 | - `dropbox`_ for access to dropbox shares 169 | - `gdrive`_ to access Google Drive and shares (experimental) 170 | 171 | .. _s3fs: https://s3fs.readthedocs.io/en/latest/ 172 | .. 
_gcsfs: https://gcsfs.readthedocs.io/en/latest/ 173 | .. _adl: https://github.com/dask/adlfs 174 | .. _abfs: https://github.com/dask/adlfs 175 | .. _dropbox: https://github.com/MarineChap/intake_dropbox 176 | .. _gdrive: https://github.com/intake/gdrivefs 177 | 178 | .. _readbuffering: 179 | 180 | Read Buffering 181 | -------------- 182 | 183 | .. autosummary:: 184 | 185 | fsspec.caching.ReadAheadCache 186 | fsspec.caching.BytesCache 187 | fsspec.caching.MMapCache 188 | fsspec.caching.BlockCache 189 | 190 | .. autoclass:: fsspec.caching.ReadAheadCache 191 | :members: 192 | 193 | .. autoclass:: fsspec.caching.BytesCache 194 | :members: 195 | 196 | .. autoclass:: fsspec.caching.MMapCache 197 | :members: 198 | 199 | .. autoclass:: fsspec.caching.BlockCache 200 | :members: 201 | -------------------------------------------------------------------------------- /fsspec/tests/test_file.py: -------------------------------------------------------------------------------- 1 | """Tests abstract buffered file API, using FTP implementation""" 2 | import pickle 3 | import sys 4 | import pytest 5 | from fsspec.implementations.tests.test_ftp import FTPFileSystem 6 | 7 | data = b"hello" * 10000 8 | 9 | 10 | @pytest.mark.xfail( 11 | sys.version_info < (3, 6), 12 | reason="py35 error, see https://github.com/intake/filesystem_spec/issues/147", 13 | ) 14 | def test_pickle(ftp_writable): 15 | host, port, user, pw = ftp_writable 16 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 17 | 18 | f = ftp.open("/out", "rb") 19 | 20 | f2 = pickle.loads(pickle.dumps(f)) 21 | assert f == f2 22 | 23 | 24 | def test_file_read_attributes(ftp_writable): 25 | host, port, user, pw = ftp_writable 26 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 27 | 28 | f = ftp.open("/out", "rb") 29 | assert f.info()["size"] == len(data) 30 | assert f.tell() == 0 31 | assert f.seekable() 32 | assert f.readable() 33 | assert not f.writable() 34 | out = bytearray(len(data)) 35 | 36 | assert f.read() == data 37 | assert f.read() == b"" 38 | f.seek(0) 39 | assert f.readuntil(b"l") == b"hel" 40 | assert f.tell() == 3 41 | 42 | f.readinto1(out) 43 | assert out[:-3] == data[3:] 44 | with pytest.raises(ValueError): 45 | f.write(b"") 46 | f.close() 47 | with pytest.raises(ValueError): 48 | f.read()(b"") 49 | 50 | 51 | def test_seek(ftp_writable): 52 | host, port, user, pw = ftp_writable 53 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 54 | 55 | f = ftp.open("/out", "rb") 56 | 57 | assert f.seek(-10, 2) == len(data) - 10 58 | assert f.tell() == len(data) - 10 59 | assert f.seek(-1, 1) == len(data) - 11 60 | with pytest.raises(ValueError): 61 | f.seek(-1) 62 | with pytest.raises(ValueError): 63 | f.seek(0, 7) 64 | 65 | 66 | def test_file_idempotent(ftp_writable): 67 | host, port, user, pw = ftp_writable 68 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 69 | 70 | f = ftp.open("/out", "rb") 71 | f2 = ftp.open("/out", "rb") 72 | assert hash(f) == hash(f2) 73 | assert f == f2 74 | ftp.touch("/out2") 75 | f2 = ftp.open("/out2", "rb") 76 | assert hash(f2) != hash(f) 77 | assert f != f2 78 | f2 = ftp.open("/out", "wb") 79 | assert hash(f2) != hash(f) 80 | 81 | 82 | def test_file_text_attributes(ftp_writable): 83 | host, port, user, pw = ftp_writable 84 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 85 | 86 | data = b"hello\n" * 1000 87 | with ftp.open("/out2", "wb") as f: 88 | f.write(data) 89 | 90 | f = ftp.open("/out2", "rb") 91 | assert 
92 | f.seek(0) 93 | assert list(f) == [d + b"\n" for d in data.split()] 94 | f.seek(0) 95 | assert f.readlines() == [d + b"\n" for d in data.split()] 96 | 97 | f = ftp.open("/out2", "rt") 98 | assert f.readline() == "hello\n" 99 | assert f.encoding 100 | 101 | 102 | def test_file_write_attributes(ftp_writable): 103 | host, port, user, pw = ftp_writable 104 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 105 | f = ftp.open("/out2", "wb") 106 | with pytest.raises(ValueError): 107 | f.info() 108 | with pytest.raises(OSError): 109 | f.seek(0) 110 | with pytest.raises(ValueError): 111 | f.read(0) 112 | assert not f.readable() 113 | assert f.writable() 114 | 115 | f.flush() # no-op 116 | 117 | assert f.write(b"hello") == 5 118 | assert f.write(b"hello") == 5 119 | assert not f.closed 120 | f.close() 121 | assert f.closed 122 | with pytest.raises(ValueError): 123 | f.write(b"") 124 | with pytest.raises(ValueError): 125 | f.flush() 126 | 127 | 128 | def test_midread_cache(ftp_writable): 129 | host, port, user, pw = ftp_writable 130 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) 131 | fn = "/myfile" 132 | with fs.open(fn, "wb") as f: 133 | f.write(b"a" * 175627146) 134 | with fs.open(fn, "rb") as f: 135 | f.seek(175561610) 136 | d1 = f.read(65536) 137 | assert len(d1) == 65536 138 | 139 | f.seek(4) 140 | size = 17562198 141 | d2 = f.read(size) 142 | assert len(d2) == size 143 | 144 | f.seek(17562288) 145 | size = 17562187 146 | d3 = f.read(size) 147 | assert len(d3) == size 148 | 149 | 150 | def test_read_block(ftp_writable): 151 | # not the same as test_read_block in test_utils, this depends on the 152 | # behaviour of the bytes caching 153 | from fsspec.utils import read_block 154 | 155 | host, port, user, pw = ftp_writable 156 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) 157 | fn = "/myfile" 158 | with fs.open(fn, "wb") as f: 159 | f.write(b"a,b\n1,2") 160 | f = fs.open(fn, "rb", cache_type="bytes") 161 | assert read_block(f, 0, 6400, b"\n") == b"a,b\n1,2" 162 | 163 | 164 | def test_with_gzip(ftp_writable): 165 | import gzip 166 | 167 | data = b"some compressible stuff" 168 | host, port, user, pw = ftp_writable 169 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) 170 | fn = "/myfile" 171 | with fs.open(fn, "wb") as f: 172 | gf = gzip.GzipFile(fileobj=f, mode="w") 173 | gf.write(data) 174 | gf.close() 175 | with fs.open(fn, "rb") as f: 176 | gf = gzip.GzipFile(fileobj=f, mode="r") 177 | assert gf.read() == data 178 | 179 | 180 | def test_with_zip(ftp_writable): 181 | import zipfile 182 | 183 | data = b"hello zip" 184 | host, port, user, pw = ftp_writable 185 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) 186 | fn = "/myfile.zip" 187 | inner_file = "test.txt" 188 | with fs.open(fn, "wb") as f: 189 | zf = zipfile.ZipFile(f, mode="w") 190 | zf.writestr(inner_file, data) 191 | zf.close() 192 | with fs.open(fn, "rb") as f: 193 | zf = zipfile.ZipFile(f, mode="r") 194 | assert zf.read(inner_file) == data 195 | --------------------------------------------------------------------------------
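The behaviours exercised above (seek whence codes, mode checks, reading after close) belong to the generic buffered-file contract, and the ``cache_type="bytes"`` argument in ``test_read_block`` is how one of the read caches from the Read Buffering section is selected by name. A minimal sketch of the same seek semantics, run against the in-memory filesystem so that no FTP server is required; on genuinely remote, buffered backends the same ``open()`` call also accepts ``cache_type``:

```python
import fsspec

# In-memory stand-in for the FTP fixture used by the tests above.
fs = fsspec.filesystem("memory")
data = b"hello" * 10000
with fs.open("/out", "wb") as f:
    f.write(data)

f = fs.open("/out", "rb")
assert f.seek(-10, 2) == len(data) - 10  # whence=2: relative to end of file
assert f.seek(-1, 1) == len(data) - 11   # whence=1: relative to current position
assert f.read() == data[-11:]
f.close()
```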
/fsspec/implementations/hdfs.py: -------------------------------------------------------------------------------- 1 | import weakref 2 | from ..spec import AbstractFileSystem 3 | from ..utils import infer_storage_options 4 | from pyarrow.hdfs import HadoopFileSystem 5 | 6 | 7 | class PyArrowHDFS(AbstractFileSystem): 8 | """Adapted version of Arrow's HadoopFileSystem 9 | 10 | This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which 11 | passes on all calls to the underlying class. 12 | """ 13 | 14 | protocol = "hdfs" 15 | 16 | def __init__( 17 | self, 18 | host="default", 19 | port=0, 20 | user=None, 21 | kerb_ticket=None, 22 | driver="libhdfs", 23 | extra_conf=None, 24 | **kwargs 25 | ): 26 | """ 27 | 28 | Parameters 29 | ---------- 30 | host: str 31 | Hostname, IP or "default" to try to read from Hadoop config 32 | port: int 33 | Port to connect on, or default from Hadoop config if 0 34 | user: str or None 35 | If given, connect as this username 36 | kerb_ticket: str or None 37 | If given, use this ticket for authentication 38 | driver: 'libhdfs' or 'libhdfs3' 39 | Binary driver; libhdfs is the JNI library and the default 40 | extra_conf: None or dict 41 | Passed on to HadoopFileSystem 42 | """ 43 | if self._cached: 44 | return 45 | AbstractFileSystem.__init__(self, **kwargs) 46 | self.pars = (host, port, user, kerb_ticket, driver, extra_conf) 47 | pahdfs = HadoopFileSystem( 48 | host=host, 49 | port=port, 50 | user=user, 51 | kerb_ticket=kerb_ticket, 52 | driver=driver, 53 | extra_conf=extra_conf, 54 | ) 55 | weakref.finalize(self, lambda: pahdfs.close()) 56 | self.pahdfs = pahdfs 57 | 58 | def _open( 59 | self, 60 | path, 61 | mode="rb", 62 | block_size=None, 63 | autocommit=True, 64 | cache_options=None, 65 | **kwargs 66 | ): 67 | """ 68 | 69 | Parameters 70 | ---------- 71 | path: str 72 | Location of file; should start with '/' 73 | mode: str 74 | block_size: int 75 | Hadoop block size, e.g., 2**26 76 | autocommit: True 77 | Transactions are not yet implemented for HDFS; errors if not True 78 | kwargs: dict or None 79 | Hadoop config parameters 80 | 81 | Returns 82 | ------- 83 | HDFSFile file-like instance 84 | """ 85 | 86 | return HDFSFile( 87 | self, 88 | path, 89 | mode, 90 | block_size=block_size, 91 | autocommit=autocommit, 92 | cache_options=cache_options, 93 | **kwargs 94 | ) 95 | 96 | def __reduce_ex__(self, protocol): 97 | return PyArrowHDFS, self.pars 98 | 99 | def ls(self, path, detail=True): 100 | out = self.pahdfs.ls(path, detail) 101 | if detail: 102 | for p in out: 103 | p["type"] = p["kind"] 104 | p["name"] = self._strip_protocol(p["name"]) 105 | else: 106 | out = [self._strip_protocol(p) for p in out] 107 | return out 108 | 109 | @staticmethod 110 | def _get_kwargs_from_urls(path): 111 | ops = infer_storage_options(path) 112 | out = {} 113 | if ops.get("host", None): 114 | out["host"] = ops["host"] 115 | if ops.get("username", None): 116 | out["user"] = ops["username"] 117 | if ops.get("port", None): 118 | out["port"] = ops["port"] 119 | return out 120 | 121 | def close(self): 122 | self.pahdfs.close() 123 | 124 | @classmethod 125 | def _strip_protocol(cls, path): 126 | ops = infer_storage_options(path) 127 | return ops["path"] 128 | 129 | def __getattribute__(self, item): 130 | if item in [ 131 | "_open", 132 | "close", 133 | "__init__", 134 | "__getattribute__", 135 | "__reduce_ex__", 136 | "open", 137 | "ls", 138 | "makedirs", 139 | ]: 140 | # all the methods defined in this class. Note `open` here, since
141 | # it calls `_open`, but is actually in superclass 142 | return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw) 143 | if item == "__class__": 144 | return PyArrowHDFS 145 | d = object.__getattribute__(self, "__dict__") 146 | pahdfs = d.get("pahdfs", None) # fs is not immediately defined 147 | if pahdfs is not None and item in [ 148 | "chmod", 149 | "chown", 150 | "user", 151 | "df", 152 | "disk_usage", 153 | "download", 154 | "driver", 155 | "exists", 156 | "extra_conf", 157 | "get_capacity", 158 | "get_space_used", 159 | "host", 160 | "is_open", 161 | "kerb_ticket", 162 | "strip_protocol", 163 | "mkdir", 164 | "mv", 165 | "port", 166 | "upload", 167 | "_get_kwargs_from_urls", 168 | "read_parquet", 169 | "rm", 170 | "stat", 171 | ]: 172 | return getattr(pahdfs, item) 173 | else: 174 | # attributes of the superclass, while target is being set up 175 | return super().__getattribute__(item) 176 | 177 | 178 | class HDFSFile(object): 179 | """Wrapper around arrow's HdfsFile 180 | 181 | Allows seek beyond EOF and (eventually) commit/discard 182 | """ 183 | 184 | def __init__( 185 | self, 186 | fs, 187 | path, 188 | mode, 189 | block_size, 190 | autocommit=True, 191 | cache_type="readahead", 192 | cache_options=None, 193 | **kwargs 194 | ): 195 | # TODO: Inherit from AbstractBufferedFile? 196 | if not autocommit: 197 | raise NotImplementedError( 198 | "HDFSFile cannot be opened with 'autocommit=False'." 199 | ) 200 | 201 | self.fs = fs 202 | self.path = path 203 | self.mode = mode 204 | self.block_size = block_size 205 | self.fh = fs.pahdfs.open(path, mode, block_size, **kwargs) 206 | if self.fh.readable(): 207 | self.seek_size = self.size() 208 | 209 | def seek(self, loc, whence=0): 210 | if whence == 0 and self.readable(): 211 | loc = min(loc, self.seek_size) 212 | return self.fh.seek(loc, whence) 213 | 214 | def __getattr__(self, item): 215 | return getattr(self.fh, item) 216 | 217 | def __reduce_ex__(self, protocol): 218 | return HDFSFile, (self.fs, self.path, self.mode, self.block_size) 219 | 220 | def __enter__(self): 221 | return self 222 | 223 | def __exit__(self, exc_type, exc_val, exc_tb): 224 | self.close() 225 | --------------------------------------------------------------------------------
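``PyArrowHDFS`` is registered under the "hdfs" protocol, so it is normally reached through ``fsspec.filesystem``. A minimal sketch, assuming pyarrow is installed and a namenode is reachable; the host, port and path below are placeholders:

```python
import fsspec

# Hypothetical connection details; substitute your own namenode.
fs = fsspec.filesystem("hdfs", host="namenode", port=8020)
print(fs.ls("/"))

# Files come back as HDFSFile wrappers, which delegate to pyarrow's
# HdfsFile but tolerate seeks beyond end-of-file.
with fs.open("/tmp/example.txt", "rb") as f:
    head = f.read(100)
```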
/fsspec/implementations/libarchive.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | 3 | from contextlib import contextmanager 4 | 5 | import libarchive 6 | from fsspec import AbstractFileSystem, open_files 7 | from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE 8 | from fsspec.implementations.memory import MemoryFile 9 | 10 | 11 | class LibArchiveFileSystem(AbstractFileSystem): 12 | """Compressed archives as a file-system (read-only) 13 | 14 | Supports the following formats: 15 | tar, pax, cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar, 16 | Microsoft CAB, 7-Zip, WARC 17 | 18 | See the libarchive documentation for further restrictions. 19 | 20 | Keeps file object open while instance lives. 21 | 22 | This class is pickleable, but not necessarily thread-safe 23 | """ 24 | 25 | root_marker = "" 26 | protocol = "libarchive" 27 | 28 | def __init__( 29 | self, 30 | fo="", 31 | mode="r", 32 | target_protocol=None, 33 | target_options=None, 34 | block_size=DEFAULT_BLOCK_SIZE, 35 | **kwargs 36 | ): 37 | """ 38 | Parameters 39 | ---------- 40 | fo: str or file-like 41 | Contains the archive, and must exist. If a str, will fetch file using 42 | `open_files()`, which must return one file exactly. 43 | mode: str 44 | Currently, only 'r' accepted 45 | target_protocol: str (optional) 46 | If ``fo`` is a string, this value can be used to override the 47 | FS protocol inferred from a URL 48 | target_options: dict (optional) 49 | Kwargs passed when instantiating the target FS, if ``fo`` is 50 | a string. 51 | """ 52 | super().__init__(**kwargs) 53 | if mode != "r": 54 | raise ValueError("Only read from archive files accepted") 55 | if isinstance(fo, str): 56 | files = open_files(fo, protocol=target_protocol, **(target_options or {})) 57 | if len(files) != 1: 58 | raise ValueError( 59 | 'Path "{}" did not resolve to exactly ' 60 | 'one file: "{}"'.format(fo, files) 61 | ) 62 | fo = files[0] 63 | self.fo = fo.__enter__() # the whole instance is a context 64 | # self.arc_reader = 65 | self.block_size = block_size 66 | self.dir_cache = None 67 | 68 | @contextmanager 69 | def _open_archive(self): 70 | self.fo.seek(0) 71 | with libarchive.fd_reader(self.fo.fileno(), block_size=self.block_size) as arc: 72 | yield arc 73 | 74 | @classmethod 75 | def _strip_protocol(cls, path): 76 | # file paths are always relative to the archive root 77 | return super()._strip_protocol(path).lstrip("/") 78 | 79 | def _get_dirs(self): 80 | fields = { 81 | "name": "pathname", 82 | "size": "size", 83 | "created": "ctime", 84 | "mode": "mode", 85 | "uid": "uid", 86 | "gid": "gid", 87 | "mtime": "mtime", 88 | } 89 | 90 | if self.dir_cache is not None: 91 | return 92 | 93 | self.dir_cache = {} 94 | list_names = [] 95 | with self._open_archive() as arc: 96 | for entry in arc: 97 | if not entry.isdir and not entry.isfile: 98 | # Skip symbolic links, fifo entries, etc.
99 | continue 100 | self.dir_cache.update( 101 | { 102 | dirname 103 | + "/": {"name": dirname + "/", "size": 0, "type": "directory"} 104 | for dirname in self._all_dirnames({entry.name}) 105 | } 106 | ) 107 | f = {key: getattr(entry, fields[key]) for key in fields} 108 | f["type"] = "directory" if entry.isdir else "file" 109 | list_names.append(entry.name) 110 | 111 | self.dir_cache[f["name"]] = f 112 | # libarchive does not seem to return an entry for the directories (at least 113 | # not in all formats), so get the directory names from the file names 114 | self.dir_cache.update( 115 | { 116 | dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"} 117 | for dirname in self._all_dirnames(list_names) 118 | } 119 | ) 120 | 121 | def info(self, path, **kwargs): 122 | self._get_dirs() 123 | path = self._strip_protocol(path) 124 | if path in self.dir_cache: 125 | return self.dir_cache[path] 126 | elif path + "/" in self.dir_cache: 127 | return self.dir_cache[path + "/"] 128 | else: 129 | raise FileNotFoundError(path) 130 | 131 | def ls(self, path, detail=False, **kwargs): 132 | self._get_dirs() 133 | paths = {} 134 | 135 | for p, f in self.dir_cache.items(): 136 | p = p.rstrip("/") 137 | if "/" in p: 138 | root = p.rsplit("/", 1)[0] 139 | else: 140 | root = "" 141 | if root == path.rstrip("/"): 142 | paths[p] = f 143 | elif all( 144 | (a == b) 145 | for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) 146 | ): 147 | # root directory entry 148 | ppath = p.rstrip("/").split("/", 1)[0] 149 | if ppath not in paths: 150 | out = {"name": ppath + "/", "size": 0, "type": "directory"} 151 | paths[ppath] = out 152 | out = list(paths.values()) 153 | if detail: 154 | return out 155 | else: 156 | return list(sorted(f["name"] for f in out)) 157 | 158 | def _open( 159 | self, 160 | path, 161 | mode="rb", 162 | block_size=None, 163 | autocommit=True, 164 | cache_options=None, 165 | **kwargs 166 | ): 167 | path = self._strip_protocol(path) 168 | if mode != "rb": 169 | raise NotImplementedError 170 | 171 | data = bytes() 172 | with self._open_archive() as arc: 173 | # FIXME? dropwhile would increase performance but less readable 174 | for entry in arc: 175 | if entry.pathname != path: 176 | continue 177 | for block in entry.get_blocks(entry.size): 178 | data += block 179 | break 180 | else: 181 | raise ValueError 182 | return MemoryFile(fs=self, path=path, data=data) 183 | 184 | def ukey(self, path): 185 | return tokenize(path, self.fo, self.protocol) 186 | 187 | def _all_dirnames(self, paths): 188 | """Returns *all* directory names for each path in paths, including intermediate ones. 189 | 190 | Parameters 191 | ---------- 192 | paths: Iterable of path strings 193 | """ 194 | if len(paths) == 0: 195 | return set() 196 | 197 | dirnames = {self._parent(path) for path in paths} - {self.root_marker} 198 | return dirnames | self._all_dirnames(dirnames) 199 | --------------------------------------------------------------------------------
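``LibArchiveFileSystem`` is registered under the "libarchive" protocol. A minimal sketch, assuming python-libarchive-c is installed; the archive name and member path below are placeholders for any archive in a format listed above:

```python
import fsspec

# "example.7z" and "inner/file.txt" are hypothetical; any local or remote
# archive that libarchive understands will do.
fs = fsspec.filesystem("libarchive", fo="example.7z")
print(fs.ls(""))                      # top-level entries of the archive
with fs.open("inner/file.txt") as f:  # read-only; the whole member is loaded
    content = f.read()
```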
/fsspec/implementations/github.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from ..spec import AbstractFileSystem 3 | from ..utils import infer_storage_options 4 | from .memory import MemoryFile 5 | 6 | 7 | class GithubFileSystem(AbstractFileSystem): 8 | """Interface to files in github 9 | 10 | An instance of this class provides the files residing within a remote github 11 | repository. You may specify a point in the repo's history, by SHA, branch 12 | or tag (default is current master). 13 | 14 | Given that code files tend to be small, and that github does not support 15 | retrieving partial content, we always fetch whole files. 16 | 17 | When using fsspec.open, allows URIs of the form: 18 | 19 | - "github://path/file", in which case you must specify org, repo and 20 | may specify sha in the extra args 21 | - 'github://org:repo@/precip/catalog.yml', where the org and repo are 22 | part of the URI 23 | - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included 24 | 25 | ``sha`` can be the full or abbreviated hex of the commit you want to fetch 26 | from, or a branch or tag name (so long as it doesn't contain special characters 27 | like "/", "?", which would have to be HTTP-encoded). 28 | 29 | For authorised access, you must provide username and token, which can be made 30 | at https://github.com/settings/tokens 31 | """ 32 | 33 | url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" 34 | rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" 35 | protocol = "github" 36 | 37 | def __init__(self, org, repo, sha="master", username=None, token=None, **kwargs): 38 | super().__init__(**kwargs) 39 | self.org = org 40 | self.repo = repo 41 | self.root = sha 42 | if (username is None) ^ (token is None): 43 | raise ValueError("Auth requires both username and token") 44 | self.username = username 45 | self.token = token 46 | self.ls("") 47 | 48 | @property 49 | def kw(self): 50 | if self.username: 51 | return {"auth": (self.username, self.token)} 52 | return {} 53 | 54 | @classmethod 55 | def repos(cls, org_or_user, is_org=True): 56 | """List repo names for given org or user 57 | 58 | This may become the top level of the FS 59 | 60 | Parameters 61 | ---------- 62 | org_or_user: str 63 | Name of the github org or user to query 64 | is_org: bool (default True) 65 | Whether the name is an organisation (True) or user (False) 66 | 67 | Returns 68 | ------- 69 | List of string 70 | """ 71 | r = requests.get( 72 | "https://api.github.com/{part}/{org}/repos".format( 73 | part=["users", "orgs"][is_org], org=org_or_user 74 | ) 75 | ) 76 | r.raise_for_status() 77 | return [repo["name"] for repo in r.json()] 78 | 79 | @property 80 | def tags(self): 81 | """Names of tags in the repo""" 82 | r = requests.get( 83 | "https://api.github.com/repos/{org}/{repo}/tags" 84 | "".format(org=self.org, repo=self.repo), 85 | **self.kw 86 | ) 87 | r.raise_for_status() 88 | return [t["name"] for t in r.json()] 89 | 90 | @property 91 | def branches(self): 92 | """Names of branches in the repo""" 93 | r = requests.get( 94 | "https://api.github.com/repos/{org}/{repo}/branches" 95 | "".format(org=self.org, repo=self.repo), 96 | **self.kw 97 | ) 98 | r.raise_for_status() 99 | return [t["name"] for t in r.json()] 100 | 101 | @property 102 | def refs(self): 103 | """Named references, tags and branches""" 104 | return {"tags": self.tags, "branches": self.branches} 105 | 106 | def ls(self, path, detail=False, sha=None, _sha=None, **kwargs): 107 | """List files at given path 108 | 109 | Parameters 110 | ---------- 111 | path: str 112 | Location to list, relative to repo root 113 | detail: bool 114 | If True, returns list of dicts, one per file; if False, returns 115 | list of full filenames only 116 | sha: str (optional) 117 | List at the given point in the repo history, branch or tag name or commit 118 | SHA 119 | _sha: str (optional) 120 | List this specific tree object (used internally to descend into trees) 121 | """ 122 | path = self._strip_protocol(path) 123 | if path == "":
"": 124 | _sha = sha or self.root 125 | if _sha is None: 126 | parts = path.rstrip("/").split("/") 127 | so_far = "" 128 | _sha = sha or self.root 129 | for part in parts: 130 | out = self.ls(so_far, True, sha=sha, _sha=_sha) 131 | so_far += "/" + part if so_far else part 132 | out = [o for o in out if o["name"] == so_far] 133 | if not out: 134 | raise FileNotFoundError(path) 135 | out = out[0] 136 | if out["type"] == "file": 137 | if detail: 138 | return [out] 139 | else: 140 | return path 141 | _sha = out["sha"] 142 | if path not in self.dircache or sha not in [self.root, None]: 143 | r = requests.get( 144 | self.url.format(org=self.org, repo=self.repo, sha=_sha), **self.kw 145 | ) 146 | if r.status_code == 404: 147 | raise FileNotFoundError(path) 148 | r.raise_for_status() 149 | out = [ 150 | { 151 | "name": path + "/" + f["path"] if path else f["path"], 152 | "mode": f["mode"], 153 | "type": {"blob": "file", "tree": "directory"}[f["type"]], 154 | "size": f.get("size", 0), 155 | "sha": f["sha"], 156 | } 157 | for f in r.json()["tree"] 158 | ] 159 | if sha in [self.root, None]: 160 | self.dircache[path] = out 161 | else: 162 | out = self.dircache[path] 163 | if detail: 164 | return out 165 | else: 166 | return sorted([f["name"] for f in out]) 167 | 168 | def invalidate_cache(self, path=None): 169 | self.dircache.clear() 170 | 171 | @classmethod 172 | def _strip_protocol(cls, path): 173 | opts = infer_storage_options(path) 174 | if "username" not in opts: 175 | return super()._strip_protocol(path) 176 | return opts["path"].lstrip("/") 177 | 178 | @staticmethod 179 | def _get_kwargs_from_urls(path): 180 | opts = infer_storage_options(path) 181 | if "username" not in opts: 182 | return {} 183 | out = {"org": opts["username"], "repo": opts["password"]} 184 | if opts["host"]: 185 | out["sha"] = opts["host"] 186 | return out 187 | 188 | def _open( 189 | self, 190 | path, 191 | mode="rb", 192 | block_size=None, 193 | autocommit=True, 194 | cache_options=None, 195 | sha=None, 196 | **kwargs 197 | ): 198 | if mode != "rb": 199 | raise NotImplementedError 200 | url = self.rurl.format( 201 | org=self.org, repo=self.repo, path=path, sha=sha or self.root 202 | ) 203 | r = requests.get(url, **self.kw) 204 | if r.status_code == 404: 205 | raise FileNotFoundError(path) 206 | r.raise_for_status() 207 | return MemoryFile(None, None, r.content) 208 | -------------------------------------------------------------------------------- /fsspec/mapping.py: -------------------------------------------------------------------------------- 1 | import array 2 | from collections.abc import MutableMapping 3 | from .core import url_to_fs 4 | 5 | 6 | class FSMap(MutableMapping): 7 | """Wrap a FileSystem instance as a mutable wrapping. 8 | 9 | The keys of the mapping become files under the given root, and the 10 | values (which must be bytes) the contents of those files. 11 | 12 | Parameters 13 | ---------- 14 | root: string 15 | prefix for all the files 16 | fs: FileSystem instance 17 | check: bool (=True) 18 | performs a touch at the location, to check for write access. 
/fsspec/mapping.py: -------------------------------------------------------------------------------- 1 | import array 2 | from collections.abc import MutableMapping 3 | from .core import url_to_fs 4 | 5 | 6 | class FSMap(MutableMapping): 7 | """Wrap a FileSystem instance as a mutable mapping. 8 | 9 | The keys of the mapping become files under the given root, and the 10 | values (which must be bytes) the contents of those files. 11 | 12 | Parameters 13 | ---------- 14 | root: string 15 | prefix for all the files 16 | fs: FileSystem instance 17 | check: bool (=False) 18 | if True, performs a touch at the location, to check for write access. 19 | 20 | Examples 21 | -------- 22 | >>> fs = FileSystem(**parameters) # doctest: +SKIP 23 | >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP 24 | or, more likely 25 | >>> d = fs.get_mapper('my-data/path/') 26 | 27 | >>> d['loc1'] = b'Hello World' # doctest: +SKIP 28 | >>> list(d.keys()) # doctest: +SKIP 29 | ['loc1'] 30 | >>> d['loc1'] # doctest: +SKIP 31 | b'Hello World' 32 | """ 33 | 34 | def __init__(self, root, fs, check=False, create=False, missing_exceptions=None): 35 | self.fs = fs 36 | self.root = fs._strip_protocol(root).rstrip( 37 | "/" 38 | ) # we join on '/' in _key_to_str 39 | if missing_exceptions is None: 40 | missing_exceptions = ( 41 | FileNotFoundError, 42 | IsADirectoryError, 43 | NotADirectoryError, 44 | ) 45 | self.missing_exceptions = missing_exceptions 46 | if create: 47 | if not self.fs.exists(root): 48 | self.fs.mkdir(root) 49 | if check: 50 | if not self.fs.exists(root): 51 | raise ValueError( 52 | "Path %s does not exist. Create " 53 | " with the ``create=True`` keyword" % root 54 | ) 55 | self.fs.touch(root + "/a") 56 | self.fs.rm(root + "/a") 57 | 58 | def clear(self): 59 | """Remove all keys below root - empties out mapping""" 60 | try: 61 | self.fs.rm(self.root, True) 62 | self.fs.mkdir(self.root) 63 | except: # noqa: E722 64 | pass 65 | 66 | def getitems(self, keys, on_error="raise"): 67 | """Fetch multiple items from the store 68 | 69 | If the backend is async-able, this might proceed concurrently 70 | 71 | Parameters 72 | ---------- 73 | keys: list(str) 74 | The keys to be fetched 75 | on_error : "raise", "omit", "return" 76 | If raise, an underlying exception will be raised (converted to KeyError 77 | if the type is in self.missing_exceptions); if omit, keys with exception 78 | will simply not be included in the output; if "return", all keys are 79 | included in the output, but the value will be bytes or an exception 80 | instance.
81 | 82 | Returns 83 | ------- 84 | dict(key, bytes|exception) 85 | """ 86 | keys2 = [self._key_to_str(k) for k in keys] 87 | oe = on_error if on_error == "raise" else "return" 88 | try: 89 | out = self.fs.cat(keys2, on_error=oe) 90 | except self.missing_exceptions as e: 91 | raise KeyError from e 92 | out = { 93 | k: (KeyError() if isinstance(v, self.missing_exceptions) else v) 94 | for k, v in out.items() 95 | } 96 | return { 97 | key: out[k2] 98 | for key, k2 in zip(keys, keys2) 99 | if on_error == "return" or not isinstance(out[k2], BaseException) 100 | } 101 | 102 | def setitems(self, values_dict): 103 | """Set the values of multiple items in the store 104 | 105 | Parameters 106 | ---------- 107 | values_dict: dict(str, bytes) 108 | """ 109 | values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()} 110 | self.fs.pipe(values) 111 | 112 | def delitems(self, keys): 113 | """Remove multiple keys from the store""" 114 | self.fs.rm([self._key_to_str(k) for k in keys]) 115 | 116 | def _key_to_str(self, key): 117 | """Generate full path for the key""" 118 | if isinstance(key, (tuple, list)): 119 | key = str(tuple(key)) 120 | else: 121 | key = str(key) 122 | return "/".join([self.root, key]) if self.root else key 123 | 124 | def _str_to_key(self, s): 125 | """Strip path off to leave key name""" 126 | return s[len(self.root) :].lstrip("/") 127 | 128 | def __getitem__(self, key, default=None): 129 | """Retrieve data""" 130 | k = self._key_to_str(key) 131 | try: 132 | result = self.fs.cat(k) 133 | except self.missing_exceptions: 134 | if default is not None: 135 | return default 136 | raise KeyError(key) 137 | return result 138 | 139 | def pop(self, key, default=None): 140 | result = self.__getitem__(key, default) 141 | try: 142 | del self[key] 143 | except KeyError: 144 | pass 145 | return result 146 | 147 | def __setitem__(self, key, value): 148 | """Store value in key""" 149 | key = self._key_to_str(key) 150 | self.fs.mkdirs(self.fs._parent(key), exist_ok=True) 151 | self.fs.pipe_file(key, maybe_convert(value)) 152 | 153 | def __iter__(self): 154 | return (self._str_to_key(x) for x in self.fs.find(self.root)) 155 | 156 | def __len__(self): 157 | return len(self.fs.find(self.root)) 158 | 159 | def __delitem__(self, key): 160 | """Remove key""" 161 | try: 162 | self.fs.rm(self._key_to_str(key)) 163 | except: # noqa: E722 164 | raise KeyError 165 | 166 | def __contains__(self, key): 167 | """Does key exist in mapping?""" 168 | path = self._key_to_str(key) 169 | return self.fs.exists(path) and self.fs.isfile(path) 170 | 171 | def __reduce__(self): 172 | return FSMap, (self.root, self.fs, False, False, self.missing_exceptions) 173 | 174 | 175 | def maybe_convert(value): 176 | if isinstance(value, array.array) or hasattr(value, "__array__"): 177 | # bytes-like things 178 | value = bytearray(memoryview(value)) 179 | return value 180 | 181 | 182 | def get_mapper(url, check=False, create=False, missing_exceptions=None, **kwargs): 183 | """Create key-value interface for given URL and options 184 | 185 | The URL will be of the form "protocol://location" and point to the root 186 | of the mapper required. All keys will be file-names below this location, 187 | and their values the contents of each key. 188 | 189 | Also accepts compound URLs like zip::s3://bucket/file.zip, see ``fsspec.open``.
190 | 191 | Parameters 192 | ---------- 193 | url: str 194 | Root URL of mapping 195 | check: bool 196 | Whether to attempt to read from the location before instantiation, to 197 | check that the mapping does exist 198 | create: bool 199 | Whether to make the directory corresponding to the root before 200 | instantiating 201 | missing_exceptions: None or tuple 202 | If given, these exception types will be regarded as missing keys and 203 | return KeyError when trying to read data. By default, you get 204 | (FileNotFoundError, IsADirectoryError, NotADirectoryError) 205 | 206 | Returns 207 | ------- 208 | ``FSMap`` instance, the dict-like key-value store. 209 | """ 210 | # Removing protocol here - could defer to each open() on the backend 211 | fs, urlpath = url_to_fs(url, **kwargs) 212 | return FSMap(urlpath, fs, check, create, missing_exceptions=missing_exceptions) 213 | --------------------------------------------------------------------------------
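A minimal sketch of the mapper interface end to end, using the in-memory filesystem so it runs without external services:

```python
import fsspec

m = fsspec.get_mapper("memory://my-data/path")
m["loc1"] = b"Hello World"          # writes the file my-data/path/loc1
assert list(m) == ["loc1"]          # keys are paths relative to the root
assert m["loc1"] == b"Hello World"  # values are the file contents, as bytes
del m["loc1"]
```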