├── requirements.txt ├── fsspec ├── tests │ ├── __init__.py │ ├── test_gui.py │ ├── test_async.py │ ├── test_fuse.py │ ├── test_config.py │ ├── test_caches.py │ ├── test_mapping.py │ ├── test_registry.py │ ├── test_compression.py │ ├── test_core.py │ └── test_file.py ├── implementations │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ ├── test_dask.py │ │ ├── conftest.py │ │ ├── test_common.py │ │ ├── test_hdfs.py │ │ ├── test_jupyter.py │ │ ├── test_git.py │ │ ├── cassettes │ │ │ └── test_dbfs_file_listing.yaml │ │ ├── test_reference.py │ │ ├── test_memory.py │ │ ├── test_smb.py │ │ ├── test_webhdfs.py │ │ ├── test_sftp.py │ │ ├── test_ftp.py │ │ ├── test_dbfs.py │ │ ├── test_zip.py │ │ └── test_libarchive.py │ ├── dvc.py │ ├── git.py │ ├── jupyter.py │ ├── sftp.py │ ├── dask.py │ ├── zip.py │ ├── reference.py │ ├── hdfs.py │ ├── libarchive.py │ └── github.py ├── __init__.py ├── conftest.py ├── transaction.py ├── dircache.py ├── config.py ├── compression.py ├── fuse.py └── mapping.py ├── .gitattributes ├── readthedocs.yml ├── docs ├── source │ ├── img │ │ └── gui.png │ ├── index.rst │ ├── developer.rst │ ├── usage.rst │ ├── intro.rst │ ├── changelog.rst │ ├── conf.py │ └── api.rst ├── environment.yml ├── README.md ├── Makefile └── make.bat ├── MANIFEST.in ├── ci ├── environment-py38.yml └── environment-win.yml ├── pyproject.toml ├── .coveragerc ├── .pre-commit-config.yaml ├── setup.cfg ├── .github └── workflows │ ├── pypipublish.yaml │ └── main.yaml ├── LICENSE ├── setup.py ├── .gitignore ├── tox.ini └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fsspec/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fsspec/implementations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | fsspec/_version.py export-subst 2 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | conda: 2 | file: docs/environment.yml 3 | -------------------------------------------------------------------------------- /docs/source/img/gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/filesystem_spec/master/docs/source/img/gui.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include fsspec/_version.py 3 | 4 | include LICENSE 5 | include README.rst 6 | include requirements.txt 7 | -------------------------------------------------------------------------------- /ci/environment-py38.yml: -------------------------------------------------------------------------------- 1 | name: test_env 2 | channels: 3 | - conda-forge 4 | - defaults 5 | 
dependencies:
 6 | - python=3.8
 7 | - tox
 8 | - tox-conda
 9 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.black]
 2 | # Revert to py34 target syntax to accommodate
 3 | # errors in trailing commas.
 4 | # https://github.com/psf/black/pull/763
 5 | target_version = ['py34']
 6 | 
--------------------------------------------------------------------------------
/fsspec/tests/test_gui.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | panel = pytest.importorskip("panel")
 4 | 
 5 | 
 6 | def test_basic():
 7 |     import fsspec.gui
 8 | 
 9 |     gui = fsspec.gui.FileSelector()
10 |     assert "url" in str(gui.panel)
11 | 
--------------------------------------------------------------------------------
/docs/environment.yml:
--------------------------------------------------------------------------------
 1 | name: fsspec
 2 | channels:
 3 | - defaults
 4 | - conda-forge
 5 | dependencies:
 6 | - python=3.7
 7 | - paramiko
 8 | - requests
 9 | - numpydoc
10 | - aiohttp
11 | - pygit2
12 | - distributed
13 | - pyarrow=1
14 | - libarchive
15 | - smbprotocol
16 | 
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # Building Documentation
 2 | 
 3 | A basic Python environment with the packages listed in `environment.yml` is
 4 | required to build the docs.
 5 | 
 6 | To make HTML documentation:
 7 | 
 8 | ```bash
 9 | make html
10 | ```
11 | 
12 | Outputs to `build/html/index.html`
13 | 
--------------------------------------------------------------------------------
/ci/environment-win.yml:
--------------------------------------------------------------------------------
 1 | name: test_env
 2 | channels:
 3 | - conda-forge
 4 | - defaults
 5 | dependencies:
 6 | - aiohttp
 7 | - pip
 8 | - requests
 9 | - zstandard
10 | - python-snappy
11 | - lz4
12 | - pyftpdlib
13 | - cloudpickle
14 | - pytest
15 | - pytest-benchmark
16 | - pytest-cov
17 | - pytest-vcr
18 | - python-libarchive-c
19 | - numpy
20 | - nomkl
21 | 
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | [run]
 2 | omit =
 3 |     */test_*.py
 4 |     fsspec/_version.py
 5 |     fsspec/implementations/dvc.py
 6 |     fsspec/implementations/github.py
 7 |     fsspec/implementations/hdfs.py
 8 | source =
 9 |     fsspec
10 | 
11 | [report]
12 | # Regexes for lines to exclude from consideration
13 | exclude_lines =
14 |     pragma: no cover
15 | 
16 |     raise AssertionError
17 |     raise NotImplementedError
18 |     pass
19 | 
20 | ignore_errors = True
21 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | exclude: >
 2 |   (?x)^(
 3 |     \.tox/.*
 4 |   )$
 5 | default_language_version:
 6 |   python: python3.7
 7 | repos:
 8 |   - repo: local
 9 |     hooks:
10 |       - id: black
11 |         name: black
12 |         entry: black
13 |         language: python
14 |         require_serial: true
15 |         types: [python]
16 |   - repo: https://github.com/pre-commit/pre-commit-hooks
17 |     rev: v2.3.0
18 |     hooks:
19 |       - id: flake8
20 | 
--------------------------------------------------------------------------------
/setup.cfg:
-------------------------------------------------------------------------------- 1 | [metadata] 2 | long_description: file: README.rst 3 | 4 | [versioneer] 5 | VCS = git 6 | style = pep440 7 | versionfile_source = fsspec/_version.py 8 | versionfile_build = fsspec/_version.py 9 | tag_prefix = "" 10 | 11 | [flake8] 12 | exclude = .tox,build,docs/source/conf.py,versioneer.py 13 | max-line-length = 88 14 | ignore = 15 | # Assigning lambda expression 16 | E731 17 | # Ambiguous variable names 18 | E741 19 | # line break before binary operator 20 | W503 21 | # whitespace before : 22 | E203 23 | -------------------------------------------------------------------------------- /fsspec/tests/test_async.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import asyncio 3 | import sys 4 | from fsspec.asyn import _run_until_done 5 | 6 | 7 | async def inner(): 8 | await asyncio.sleep(1) 9 | return True 10 | 11 | 12 | async def outer(): 13 | await asyncio.sleep(1) 14 | return _run_until_done(inner()) 15 | 16 | 17 | @pytest.mark.skipif(sys.version_info < (3, 7), reason="Async fails on py36") 18 | def test_runtildone(): 19 | loop = asyncio.new_event_loop() 20 | asyncio.set_event_loop(loop) 21 | assert loop.run_until_complete(outer()) 22 | loop.close() 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = fsspec 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
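# For example, "make html O=-W" forwards -W (treat warnings as errors) to sphinx-build.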
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_dask.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import fsspec 3 | 4 | pytest.importorskip("distributed") 5 | 6 | 7 | @pytest.fixture() 8 | def cli(tmpdir): 9 | import dask.distributed 10 | 11 | client = dask.distributed.Client(n_workers=1) 12 | 13 | def setup(): 14 | m = fsspec.filesystem("memory") 15 | with m.open("afile", "wb") as f: 16 | f.write(b"data") 17 | 18 | client.run(setup) 19 | try: 20 | yield client 21 | finally: 22 | client.close() 23 | 24 | 25 | def test_basic(cli): 26 | 27 | fs = fsspec.filesystem("dask", target_protocol="memory") 28 | assert fs.ls("") == ["afile"] 29 | assert fs.cat("afile") == b"data" 30 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import pytest 4 | 5 | from fsspec.implementations.local import LocalFileSystem 6 | 7 | 8 | # A dummy filesystem that has a list of protocols 9 | class MultiProtocolFileSystem(LocalFileSystem): 10 | protocol = ["file", "other"] 11 | 12 | 13 | FILESYSTEMS = {"local": LocalFileSystem, "multi": MultiProtocolFileSystem} 14 | 15 | READ_ONLY_FILESYSTEMS = [] 16 | 17 | 18 | @pytest.fixture(scope="function") 19 | def fs(request): 20 | cls = FILESYSTEMS[request.param] 21 | return cls() 22 | 23 | 24 | @pytest.fixture(scope="function") 25 | def temp_file(): 26 | with tempfile.TemporaryDirectory() as temp_dir: 27 | return temp_dir + "test-file" 28 | -------------------------------------------------------------------------------- /.github/workflows/pypipublish.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: "3.x" 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools setuptools-scm wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=fsspec 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_common.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import pytest 4 | 5 | from fsspec import AbstractFileSystem 6 | from fsspec.implementations.tests.conftest import READ_ONLY_FILESYSTEMS 7 | 8 | 9 | @pytest.mark.parametrize("fs", ["local"], indirect=["fs"]) 10 | def test_created(fs: AbstractFileSystem, temp_file): 11 | try: 12 | fs.touch(temp_file) 13 | created = fs.created(path=temp_file) 14 | assert isinstance(created, datetime.datetime) 15 | finally: 16 | if not isinstance(fs, tuple(READ_ONLY_FILESYSTEMS)): 17 | fs.rm(temp_file) 18 | 19 | 20 | @pytest.mark.parametrize("fs", ["local"], indirect=["fs"]) 21 | def test_modified(fs: AbstractFileSystem, temp_file): 22 | try: 23 | fs.touch(temp_file) 24 | created = fs.created(path=temp_file) 25 | time.sleep(0.05) 26 | fs.touch(temp_file) 27 | modified = fs.modified(path=temp_file) 28 | assert isinstance(modified, datetime.datetime) 29 | assert modified > created 30 | finally: 31 | fs.rm(temp_file) 32 | -------------------------------------------------------------------------------- /fsspec/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from importlib.metadata import entry_points 3 | except ImportError: # python < 3.8 4 | try: 5 | from importlib_metadata import entry_points 6 | except ImportError: 7 | entry_points = None 8 | 9 | 10 | from . import caching 11 | from ._version import get_versions 12 | from .core import get_fs_token_paths, open, open_files, open_local 13 | from .mapping import FSMap, get_mapper 14 | from .registry import ( 15 | filesystem, 16 | get_filesystem_class, 17 | register_implementation, 18 | registry, 19 | ) 20 | from .spec import AbstractFileSystem 21 | 22 | __version__ = get_versions()["version"] 23 | del get_versions 24 | 25 | 26 | __all__ = [ 27 | "AbstractFileSystem", 28 | "FSMap", 29 | "filesystem", 30 | "register_implementation", 31 | "get_filesystem_class", 32 | "get_fs_token_paths", 33 | "get_mapper", 34 | "open", 35 | "open_files", 36 | "open_local", 37 | "registry", 38 | "caching", 39 | ] 40 | 41 | if entry_points is not None: 42 | entry_points = entry_points() 43 | for spec in entry_points.get("fsspec.specs", []): 44 | err_msg = f"Unable to load filesystem from {spec}" 45 | register_implementation(spec.name, spec.module, errtxt=err_msg) 46 | -------------------------------------------------------------------------------- /fsspec/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import sys 5 | import time 6 | 7 | import pytest 8 | 9 | import fsspec 10 | from fsspec.implementations.cached import CachingFileSystem 11 | 12 | 13 | @pytest.fixture() 14 | def m(): 15 | """ 16 | Fixture providing a memory filesystem. 17 | """ 18 | m = fsspec.filesystem("memory") 19 | m.store.clear() 20 | try: 21 | yield m 22 | finally: 23 | m.store.clear() 24 | 25 | 26 | @pytest.fixture 27 | def ftp_writable(tmpdir): 28 | """ 29 | Fixture providing a writable FTP filesystem. 
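    Starts a local pyftpdlib FTP server in a subprocess and yields the
    connection details ``(host, port, username, password)``, here
    ``("localhost", 2121, "user", "pass")``, once the server is ready.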
30 | """ 31 | pytest.importorskip("pyftpdlib") 32 | from fsspec.implementations.ftp import FTPFileSystem 33 | 34 | FTPFileSystem.clear_instance_cache() # remove lingering connections 35 | CachingFileSystem.clear_instance_cache() 36 | d = str(tmpdir) 37 | with open(os.path.join(d, "out"), "wb") as f: 38 | f.write(b"hello" * 10000) 39 | P = subprocess.Popen( 40 | [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"] 41 | ) 42 | try: 43 | time.sleep(1) 44 | yield "localhost", 2121, "user", "pass" 45 | finally: 46 | P.terminate() 47 | P.wait() 48 | try: 49 | shutil.rmtree(tmpdir) 50 | except Exception: 51 | pass 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Martin Durant 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_hdfs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import fsspec 3 | 4 | pyarrow = pytest.importorskip("pyarrow") 5 | 6 | basedir = "/tmp/test-fsspec" 7 | data = b"\n".join([b"some test data"] * 1000) 8 | 9 | 10 | @pytest.fixture 11 | def hdfs(request): 12 | try: 13 | hdfs = pyarrow.hdfs.connect() 14 | except IOError: 15 | pytest.skip("No HDFS configured") 16 | 17 | if hdfs.exists(basedir): 18 | hdfs.rm(basedir, recursive=True) 19 | 20 | hdfs.mkdir(basedir) 21 | 22 | with hdfs.open(basedir + "/file", "wb") as f: 23 | f.write(data) 24 | 25 | yield hdfs 26 | 27 | if hdfs.exists(basedir): 28 | hdfs.rm(basedir, recursive=True) 29 | 30 | 31 | def test_ls(hdfs): 32 | h = fsspec.filesystem("hdfs") 33 | out = [f["name"] for f in h.ls(basedir)] 34 | assert out == hdfs.ls(basedir) 35 | 36 | 37 | def test_walk(hdfs): 38 | h = fsspec.filesystem("hdfs") 39 | out = h.walk(basedir) 40 | assert list(out) == list(hdfs.walk(basedir)) 41 | 42 | 43 | def test_isdir(hdfs): 44 | h = fsspec.filesystem("hdfs") 45 | assert h.isdir(basedir) 46 | assert not h.isdir(basedir + "/file") 47 | 48 | 49 | def test_exists(hdfs): 50 | h = fsspec.filesystem("hdfs") 51 | assert not h.exists(basedir + "/notafile") 52 | 53 | 54 | def test_read(hdfs): 55 | h = fsspec.filesystem("hdfs") 56 | out = basedir + "/file" 57 | with h.open(out, "rb") as f: 58 | assert f.read() == data 59 | with h.open(out, "rb", block_size=0) as f: 60 | assert f.read() == data 61 | with h.open(out, "rb") as f: 62 | assert f.read(100) + f.read() == data 63 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_jupyter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import subprocess 4 | import time 5 | import pytest 6 | 7 | import fsspec 8 | 9 | pytest.importorskip("notebook") 10 | requests = pytest.importorskip("requests") 11 | 12 | 13 | @pytest.fixture() 14 | def jupyter(tmpdir): 15 | 16 | tmpdir = str(tmpdir) 17 | os.environ["JUPYTER_TOKEN"] = "blah" 18 | try: 19 | cmd = f"jupyter notebook --notebook-dir={tmpdir} --no-browser --port=5566" 20 | P = subprocess.Popen(shlex.split(cmd)) 21 | except FileNotFoundError: 22 | pytest.skip("notebook not installed correctly") 23 | try: 24 | timeout = 15 25 | while True: 26 | try: 27 | r = requests.get("http://localhost:5566/?token=blah") 28 | r.raise_for_status() 29 | break 30 | except (requests.exceptions.BaseHTTPError, IOError): 31 | time.sleep(0.1) 32 | timeout -= 0.1 33 | if timeout < 0: 34 | pytest.xfail("Timed out for jupyter") 35 | yield "http://localhost:5566/?token=blah", tmpdir 36 | finally: 37 | P.terminate() 38 | 39 | 40 | def test_simple(jupyter): 41 | url, d = jupyter 42 | fs = fsspec.filesystem("jupyter", url=url) 43 | assert fs.ls("") == [] 44 | 45 | fs.pipe("afile", b"data") 46 | assert fs.cat("afile") == b"data" 47 | assert "afile" in os.listdir(d) 48 | 49 | with fs.open("bfile", "wb") as f: 50 | f.write(b"more") 51 | with fs.open("bfile", "rb") as f: 52 | assert f.read() == b"more" 53 | 54 | assert fs.info("bfile")["size"] == 4 55 | fs.rm("afile") 56 | 57 | assert "afile" not in os.listdir(d) 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | import os 3 | 4 | from setuptools import setup 5 | import versioneer 6 | 7 | here = os.path.abspath(os.path.dirname(__file__)) 8 | with open(os.path.join(here, "README.md"), encoding="utf-8") as f: 9 | long_description = f.read() 10 | 11 | setup( 12 | name="fsspec", 13 | version=versioneer.get_version(), 14 | cmdclass=versioneer.get_cmdclass(), 15 | classifiers=[ 16 | "Development Status :: 4 - Beta", 17 | "Intended Audience :: Developers", 18 | "License :: OSI Approved :: BSD License", 19 | "Operating System :: OS Independent", 20 | "Programming Language :: Python :: 3.6", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | ], 24 | description="File-system specification", 25 | long_description=long_description, 26 | long_description_content_type="text/markdown", 27 | url="http://github.com/intake/filesystem_spec", 28 | maintainer="Martin Durant", 29 | maintainer_email="mdurant@anaconda.com", 30 | license="BSD", 31 | keywords="file", 32 | packages=["fsspec", "fsspec.implementations"], 33 | python_requires=">3.6", 34 | install_requires=open("requirements.txt").read().strip().split("\n"), 35 | extras_require={ 36 | ":python_version < '3.8'": ['importlib_metadata'], 37 | "abfs": ["adlfs"], 38 | "adl": ["adlfs"], 39 | "dask": ["dask", "distributed"], 40 | "dropbox": ["dropboxdrivefs", "requests", "dropbox"], 41 | "gcs": ["gcsfs"], 42 | "git": ["pygit2"], 43 | "github": ["requests"], 44 | "gs": ["gcsfs"], 45 | "hdfs": ["pyarrow"], 46 | "http": ["requests", "aiohttp"], 47 | "sftp": ["paramiko"], 48 | "s3": ["s3fs"], 49 | "smb": ["smbprotocol"], 50 | "ssh": ["paramiko"], 51 | }, 52 | zip_safe=False, 53 | ) 54 | -------------------------------------------------------------------------------- /fsspec/tests/test_fuse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import time 4 | from multiprocessing import Process 5 | 6 | import pytest 7 | 8 | try: 9 | pytest.importorskip("fuse") # noqa: E402 10 | except OSError: 11 | # can succeed in importing fuse, but fail to load so 12 | pytest.importorskip("nonexistent") # noqa: E402 13 | 14 | from fsspec.fuse import run 15 | from fsspec.implementations.memory import MemoryFileSystem 16 | 17 | 18 | def host_fuse(mountdir): 19 | fs = MemoryFileSystem() 20 | fs.touch("/mounted/testfile") 21 | run(fs, "/mounted/", mountdir) 22 | 23 | 24 | def test_basic(tmpdir, capfd): 25 | mountdir = str(tmpdir.mkdir("mount")) 26 | 27 | fuse_process = Process(target=host_fuse, args=(str(mountdir),)) 28 | fuse_process.start() 29 | 30 | try: 31 | timeout = 10 32 | while True: 33 | try: 34 | # can fail with device not ready while waiting for fuse 35 | if "testfile" in os.listdir(mountdir): 36 | break 37 | except Exception: 38 | pass 39 | timeout -= 1 40 | time.sleep(1) 41 | assert timeout > 0, "Timeout" 42 | 43 | fn = os.path.join(mountdir, "test") 44 | with open(fn, "wb") as f: 45 | f.write(b"data") 46 | 47 | with open(fn) as f: 48 | assert f.read() == "data" 49 | 50 | os.remove(fn) 51 | 52 | os.mkdir(fn) 53 | assert os.listdir(fn) == [] 54 | 55 | os.mkdir(fn + "/inner") 56 | 57 | with pytest.raises(OSError): 58 | os.rmdir(fn) 59 | 60 | captured = capfd.readouterr() 61 | assert "Traceback" not in captured.out 62 | assert "Traceback" not in captured.err 63 | 64 | os.rmdir(fn + "/inner") 65 | os.rmdir(fn) 66 | finally: 67 | os.kill(fuse_process.pid, signal.SIGTERM) 68 | fuse_process.join() 69 | 
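# A minimal sketch of the same mounting pattern outside pytest; the mount
# point path is illustrative, and run() blocks until the filesystem is
# unmounted, hence the separate process used in the fixture above:
#
#     from fsspec.fuse import run
#     from fsspec.implementations.memory import MemoryFileSystem
#
#     fs = MemoryFileSystem()
#     fs.touch("/mounted/testfile")
#     run(fs, "/mounted/", "/tmp/fuse-mnt")  # mount point must already exist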
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dask 2 | dask-worker-space 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | pip-wheel-metadata/ 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # jetbrains ide stuff 109 | *.iml 110 | .idea/ 111 | 112 | # vscode ide stuff 113 | *.code-workspace 114 | .history 115 | .vscode 116 | -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: "*" 6 | pull_request: 7 | branches: master 8 | 9 | jobs: 10 | linux: 11 | name: ${{ matrix.TOXENV }}-pytest 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | TOXENV: [py37, py38, s3fs, gcsfs] 17 | 18 | env: 19 | TOXENV: ${{ matrix.TOXENV }} 20 | CIRUN: true 21 | 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v2 25 | 26 | - name: Setup Miniconda 27 | uses: conda-incubator/setup-miniconda@v2 28 | with: 29 | auto-update-conda: true 30 | auto-activate-base: false 31 | activate-environment: test_env 32 | environment-file: ci/environment-py38.yml 33 | 34 | - name: Run Tests 35 | shell: bash -l {0} 36 | run: | 37 | tox -v 38 | 39 | win: 40 | name: ${{ matrix.TOXENV }}-pytest-win 41 | runs-on: windows-2019 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | TOXENV: [py38] 46 | 47 | env: 48 | TOXENV: ${{ matrix.TOXENV }} 49 | CIRUN: true 50 | 51 | steps: 52 | - name: Checkout 53 | uses: actions/checkout@v2 54 | 55 | - name: Setup Miniconda 56 | uses: conda-incubator/setup-miniconda@v2 57 | with: 58 | auto-update-conda: true 59 | auto-activate-base: false 60 | activate-environment: test_env 61 | environment-file: ci/environment-win.yml 62 | 63 | - name: Run Tests 64 | shell: bash -l {0} 65 | run: | 66 | 
pytest -v 67 | 68 | lint: 69 | name: lint 70 | runs-on: ubuntu-latest 71 | steps: 72 | - uses: actions/checkout@v2 73 | - uses: actions/setup-python@v2 74 | - name: pip-install 75 | shell: bash -l {0} 76 | run: | 77 | pip install flake8 black 78 | - name: Lint 79 | shell: bash -l {0} 80 | run: | 81 | flake8 fsspec 82 | black fsspec --check 83 | -------------------------------------------------------------------------------- /fsspec/tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import fsspec 4 | from fsspec.config import conf, set_conf_env, set_conf_files 5 | 6 | 7 | @pytest.fixture 8 | def clean_conf(): 9 | """Tests should start and end with clean config dict""" 10 | conf.clear() 11 | yield 12 | conf.clear() 13 | 14 | 15 | def test_from_env(clean_conf): 16 | env = { 17 | "FSSPEC_PROTO_KEY": "value", 18 | "FSSPEC_PROTO_LONG_KEY": "othervalue", 19 | "FSSPEC_MALFORMED": "novalue", 20 | } 21 | cd = {} 22 | set_conf_env(conf_dict=cd, envdict=env) 23 | assert cd == {"proto": {"key": "value", "long_key": "othervalue"}} 24 | 25 | 26 | def test_from_file_ini(clean_conf, tmpdir): 27 | file1 = os.path.join(tmpdir, "1.ini") 28 | file2 = os.path.join(tmpdir, "2.ini") 29 | with open(file1, "w") as f: 30 | f.write( 31 | """[proto] 32 | key=value 33 | other_key:othervalue 34 | overwritten=dont_see 35 | """ 36 | ) 37 | with open(file2, "w") as f: 38 | f.write( 39 | """[proto] 40 | overwritten=see 41 | """ 42 | ) 43 | cd = {} 44 | set_conf_files(tmpdir, cd) 45 | assert cd == { 46 | "proto": {"key": "value", "other_key": "othervalue", "overwritten": "see"} 47 | } 48 | 49 | 50 | def test_from_file_json(clean_conf, tmpdir): 51 | file1 = os.path.join(tmpdir, "1.json") 52 | file2 = os.path.join(tmpdir, "2.json") 53 | with open(file1, "w") as f: 54 | f.write( 55 | """{"proto": 56 | {"key": "value", 57 | "other_key": "othervalue", 58 | "overwritten": false}} 59 | """ 60 | ) 61 | with open(file2, "w") as f: 62 | f.write( 63 | """{"proto": 64 | {"overwritten": true}} 65 | """ 66 | ) 67 | cd = {} 68 | set_conf_files(tmpdir, cd) 69 | assert cd == { 70 | "proto": {"key": "value", "other_key": "othervalue", "overwritten": True} 71 | } 72 | 73 | 74 | def test_apply(clean_conf): 75 | conf["file"] = {"auto_mkdir": "test"} 76 | fs = fsspec.filesystem("file") 77 | assert fs.auto_mkdir == "test" 78 | fs = fsspec.filesystem("file", auto_mkdir=True) 79 | assert fs.auto_mkdir is True 80 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # content of: tox.ini , put in same dir as setup.py 2 | [tox] 3 | envlist = {py36,py37,py38} 4 | 5 | [core] 6 | conda_channels= 7 | defaults 8 | conda-forge 9 | conda_deps= 10 | pip 11 | paramiko 12 | requests 13 | zstandard 14 | python-snappy 15 | aiohttp 16 | lz4 17 | distributed 18 | dask 19 | pyarrow 20 | panel 21 | notebook 22 | pygit2 23 | git 24 | s3fs 25 | pyftpdlib 26 | cloudpickle 27 | pytest 28 | pytest-benchmark 29 | pytest-cov 30 | pytest-vcr 31 | fusepy 32 | msgpack-python 33 | python-libarchive-c 34 | numpy 35 | nomkl 36 | deps= 37 | hadoop-test-cluster==0.1.0 38 | smbprotocol 39 | py36,py37: importlib_metadata 40 | 41 | [testenv] 42 | description=Run test suite against target versions. 
43 | conda_channels= 44 | {[core]conda_channels} 45 | conda_deps= 46 | {[core]conda_deps} 47 | deps= 48 | {[core]deps} 49 | commands = 50 | py.test --cov=fsspec -v -r s {posargs} 51 | passenv = CIRUN 52 | 53 | [testenv:s3fs] 54 | description=Run s3fs (@master) test suite against fsspec. 55 | extras=s3 56 | conda_channels= 57 | defaults 58 | conda-forge 59 | conda_deps= 60 | {[core]conda_deps} 61 | httpretty 62 | aiobotocore 63 | moto 64 | flask 65 | changedir=.tox/s3fs/tmp 66 | whitelist_externals= 67 | rm 68 | git 69 | setenv= 70 | BOTO_CONFIG=/dev/null 71 | AWS_ACCESS_KEY_ID=foobar_key 72 | AWS_SECRET_ACCESS_KEY=foobar_secret 73 | commands= 74 | rm -rf s3fs 75 | git clone https://github.com/dask/s3fs 76 | py.test -vv s3fs/s3fs 77 | 78 | [testenv:gcsfs] 79 | description=Run gcsfs (@master) test suite against fsspec. 80 | extras=gcs 81 | conda_channels= 82 | defaults 83 | conda-forge 84 | conda_deps= 85 | {[core]conda_deps} 86 | deps= 87 | {[core]deps} 88 | vcrpy 89 | google-auth-oauthlib 90 | changedir=.tox/gcsfs/tmp 91 | whitelist_externals= 92 | rm 93 | git 94 | setenv= 95 | GCSFS_RECORD_MODE=none 96 | commands= 97 | rm -rf gcsfs 98 | git clone https://github.com/dask/gcsfs 99 | py.test -vv gcsfs/gcsfs -k 'not fuse' 100 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_git.py: -------------------------------------------------------------------------------- 1 | import fsspec 2 | import os 3 | import pytest 4 | import shutil 5 | import tempfile 6 | import subprocess 7 | 8 | pygit2 = pytest.importorskip("pygit2") 9 | 10 | 11 | @pytest.fixture() 12 | def repo(): 13 | orig_dir = os.getcwd() 14 | d = tempfile.mkdtemp() 15 | try: 16 | os.chdir(d) 17 | subprocess.call("git init", shell=True, cwd=d) 18 | subprocess.call("git init", shell=True, cwd=d) 19 | subprocess.call('git config user.email "you@example.com"', shell=True, cwd=d) 20 | subprocess.call('git config user.name "Your Name"', shell=True, cwd=d) 21 | open(os.path.join(d, "file1"), "wb").write(b"data0") 22 | subprocess.call("git add file1", shell=True, cwd=d) 23 | subprocess.call('git commit -m "init"', shell=True, cwd=d) 24 | sha = open(os.path.join(d, ".git/refs/heads/master"), "r").read().strip() 25 | open(os.path.join(d, "file1"), "wb").write(b"data00") 26 | subprocess.check_output('git commit -a -m "tagger"', shell=True, cwd=d) 27 | subprocess.call('git tag -a thetag -m "make tag"', shell=True, cwd=d) 28 | open(os.path.join(d, "file2"), "wb").write(b"data000") 29 | subprocess.call("git add file2", shell=True) 30 | subprocess.call('git commit -m "master tip"', shell=True, cwd=d) 31 | subprocess.call("git checkout -b abranch", shell=True, cwd=d) 32 | os.mkdir("inner") 33 | open(os.path.join(d, "inner", "file1"), "wb").write(b"data3") 34 | subprocess.call("git add inner/file1", shell=True, cwd=d) 35 | subprocess.call('git commit -m "branch tip"', shell=True, cwd=d) 36 | yield d, sha 37 | finally: 38 | os.chdir(orig_dir) 39 | shutil.rmtree(d) 40 | 41 | 42 | def test_refs(repo): 43 | d, sha = repo 44 | with fsspec.open("git://file1", path=d, ref=sha) as f: 45 | assert f.read() == b"data0" 46 | 47 | with fsspec.open("git://file1", path=d, ref="thetag") as f: 48 | assert f.read() == b"data00" 49 | 50 | with fsspec.open("git://file2", path=d, ref="master") as f: 51 | assert f.read() == b"data000" 52 | 53 | with fsspec.open("git://file2", path=d, ref=None) as f: 54 | assert f.read() == b"data000" 55 | 56 | with fsspec.open("git://inner/file1", path=d, ref="abranch") as f: 
57 | assert f.read() == b"data3" 58 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/cassettes/test_dbfs_file_listing.yaml: -------------------------------------------------------------------------------- 1 | interactions: 2 | - request: 3 | body: '{"path": "/"}' 4 | headers: 5 | Accept: 6 | - '*/*' 7 | Accept-Encoding: 8 | - gzip, deflate 9 | Connection: 10 | - keep-alive 11 | Content-Length: 12 | - '13' 13 | Content-Type: 14 | - application/json 15 | User-Agent: 16 | - python-requests/2.25.1 17 | authorization: 18 | - DUMMY 19 | method: GET 20 | uri: https://my_instance.com/api/2.0/dbfs/list 21 | response: 22 | body: 23 | string: !!binary | 24 | H4sIAAAAAAAEA4zLMQ5AQBBG4bv89Qr1HsAFlCKyGDFByMxsQ9ydaHSy7cv3Toy8kMLXJ/ZgEzzy 25 | 8imVbUJwYG0HFniTSO61rfJB8MXlvmMIFjrhftZMSONimrwaqaXjdU+2UUn+cXPdAAAA//8DAHlY 26 | NJf+AAAA 27 | headers: 28 | content-encoding: 29 | - gzip 30 | content-type: 31 | - application/json 32 | server: 33 | - databricks 34 | strict-transport-security: 35 | - max-age=31536000; includeSubDomains; preload 36 | transfer-encoding: 37 | - chunked 38 | vary: 39 | - Accept-Encoding 40 | x-content-type-options: 41 | - nosniff 42 | status: 43 | code: 200 44 | message: OK 45 | - request: 46 | body: '{"path": "/"}' 47 | headers: 48 | Accept: 49 | - '*/*' 50 | Accept-Encoding: 51 | - gzip, deflate 52 | Connection: 53 | - keep-alive 54 | Content-Length: 55 | - '13' 56 | Content-Type: 57 | - application/json 58 | User-Agent: 59 | - python-requests/2.25.1 60 | authorization: 61 | - DUMMY 62 | method: GET 63 | uri: https://my_instance.com/api/2.0/dbfs/list 64 | response: 65 | body: 66 | string: !!binary | 67 | H4sIAAAAAAAEA4zLMQ5AQBBG4bv89Qr1HsAFlCKyGDFByMxsQ9ydaHSy7cv3Toy8kMLXJ/ZgEzzy 68 | 8imVbUJwYG0HFniTSO61rfJB8MXlvmMIFjrhftZMSONimrwaqaXjdU+2UUn+cXPdAAAA//8DAHlY 69 | NJf+AAAA 70 | headers: 71 | content-encoding: 72 | - gzip 73 | content-type: 74 | - application/json 75 | server: 76 | - databricks 77 | strict-transport-security: 78 | - max-age=31536000; includeSubDomains; preload 79 | transfer-encoding: 80 | - chunked 81 | vary: 82 | - Accept-Encoding 83 | x-content-type-options: 84 | - nosniff 85 | status: 86 | code: 200 87 | message: OK 88 | version: 1 89 | -------------------------------------------------------------------------------- /fsspec/implementations/dvc.py: -------------------------------------------------------------------------------- 1 | import os 2 | from fsspec.spec import AbstractFileSystem 3 | from fsspec.implementations.local import LocalFileSystem 4 | import dvc.repo 5 | 6 | lfs = LocalFileSystem() 7 | 8 | 9 | class DVCFileSystem(AbstractFileSystem): 10 | """DVC backend (experimental) 11 | 12 | Load data files that are versioned using the `Data Version Control`_ system 13 | 14 | .. _Data Version Control: https://dvc.org/ 15 | 16 | This interface is incomplete and experimental. 17 | """ 18 | 19 | root_marker = "" 20 | 21 | def __init__(self, path=None, **kwargs): 22 | """ 23 | 24 | Parameters 25 | ---------- 26 | path: str (optional) 27 | Location of the repo to access; defaults to the current directory. 
28 | """ 29 | super().__init__(**kwargs) 30 | self.repo = dvc.repo.Repo(path) 31 | self.path = self.repo.find_root() 32 | 33 | @classmethod 34 | def _strip_protocol(cls, path): 35 | return super()._strip_protocol(path).lstrip("/") 36 | 37 | def ls(self, path, detail=False, **kwargs): 38 | path = self._strip_protocol(path) 39 | allfiles = self.repo.tree.walk(os.path.join(self.repo.root_dir, path)) 40 | dirname, dirs, files = next(allfiles) 41 | out = [os.path.join(path, f) for f in dirs + files] 42 | details = [] 43 | 44 | for f in out: 45 | full = os.path.join(self.repo.root_dir, f) 46 | file_info = lfs.info(full) 47 | if lfs.isdir(full): 48 | details.append(file_info) 49 | else: 50 | try: 51 | extra = self.repo.find_out_by_relpath(full).dumpd() 52 | except dvc.exceptions.OutputNotFoundError: 53 | continue 54 | details.append(dict(**extra, **file_info)) 55 | details[-1]["name"] = f 56 | if detail: 57 | return details 58 | return [d["name"] for d in details] 59 | 60 | def ukey(self, path): 61 | return self.info(path)["md5"] 62 | 63 | def _open( 64 | self, 65 | path, 66 | mode="rb", 67 | block_size=None, 68 | autocommit=True, 69 | cache_options=None, 70 | **kwargs 71 | ): 72 | # returns a context file object (i.e., needs to be used with ``with`` 73 | path = self._strip_protocol(path) 74 | return self.repo.open_by_relpath(path) 75 | -------------------------------------------------------------------------------- /fsspec/transaction.py: -------------------------------------------------------------------------------- 1 | class Transaction(object): 2 | """Filesystem transaction write context 3 | 4 | Gathers files for deferred commit or discard, so that several write 5 | operations can be finalized semi-atomically. This works by having this 6 | instance as the ``.transaction`` attribute of the given filesystem 7 | """ 8 | 9 | def __init__(self, fs): 10 | """ 11 | Parameters 12 | ---------- 13 | fs: FileSystem instance 14 | """ 15 | self.fs = fs 16 | self.files = [] 17 | 18 | def __enter__(self): 19 | self.start() 20 | 21 | def __exit__(self, exc_type, exc_val, exc_tb): 22 | """End transaction and commit, if exit is not due to exception""" 23 | # only commit if there was no exception 24 | self.complete(commit=exc_type is None) 25 | self.fs._intrans = False 26 | self.fs._transaction = None 27 | 28 | def start(self): 29 | """Start a transaction on this FileSystem""" 30 | self.files = [] # clean up after previous failed completions 31 | self.fs._intrans = True 32 | 33 | def complete(self, commit=True): 34 | """Finish transaction: commit or discard all deferred files""" 35 | for f in self.files: 36 | if commit: 37 | f.commit() 38 | else: 39 | f.discard() 40 | self.files = [] 41 | self.fs._intrans = False 42 | 43 | 44 | class FileActor(object): 45 | def __init__(self): 46 | self.files = [] 47 | 48 | def commit(self): 49 | for f in self.files: 50 | f.commit() 51 | self.files.clear() 52 | 53 | def discard(self): 54 | for f in self.files: 55 | f.discard() 56 | self.files.clear() 57 | 58 | def append(self, f): 59 | self.files.append(f) 60 | 61 | 62 | class DaskTransaction(Transaction): 63 | def __init__(self, fs): 64 | """ 65 | Parameters 66 | ---------- 67 | fs: FileSystem instance 68 | """ 69 | import distributed 70 | 71 | super().__init__(fs) 72 | client = distributed.default_client() 73 | self.files = client.submit(FileActor, actor=True).result() 74 | 75 | def complete(self, commit=True): 76 | """Finish transaction: commit or discard all deferred files""" 77 | if commit: 78 | 
self.files.commit().result() 79 | else: 80 | self.files.discard().result() 81 | self.fs._intrans = False 82 | -------------------------------------------------------------------------------- /fsspec/tests/test_caches.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import string 3 | 4 | import pytest 5 | from fsspec.caching import BlockCache, caches 6 | 7 | 8 | def test_cache_getitem(Cache_imp): 9 | cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters)) 10 | assert cacher._fetch(0, 4) == b"abcd" 11 | assert cacher._fetch(None, 4) == b"abcd" 12 | assert cacher._fetch(2, 4) == b"cd" 13 | 14 | 15 | def test_block_cache_lru(): 16 | cache = BlockCache(4, letters_fetcher, len(string.ascii_letters), maxblocks=2) 17 | # miss 18 | cache._fetch(0, 2) 19 | assert cache.cache_info().misses == 1 20 | assert cache.cache_info().currsize == 1 21 | 22 | # hit 23 | cache._fetch(0, 2) 24 | assert cache.cache_info().misses == 1 25 | assert cache.cache_info().currsize == 1 26 | 27 | # miss 28 | cache._fetch(4, 6) 29 | assert cache.cache_info().misses == 2 30 | assert cache.cache_info().currsize == 2 31 | 32 | # miss & evict 33 | cache._fetch(12, 13) 34 | assert cache.cache_info().misses == 3 35 | assert cache.cache_info().currsize == 2 36 | 37 | 38 | def _fetcher(start, end): 39 | return b"0" * (end - start) 40 | 41 | 42 | def letters_fetcher(start, end): 43 | return string.ascii_letters[start:end].encode() 44 | 45 | 46 | @pytest.fixture(params=caches.values(), ids=list(caches.keys())) 47 | def Cache_imp(request): 48 | return request.param 49 | 50 | 51 | def test_cache_empty_file(Cache_imp): 52 | blocksize = 5 53 | size = 0 54 | cache = Cache_imp(blocksize, _fetcher, size) 55 | assert cache._fetch(0, 0) == b"" 56 | 57 | 58 | def test_cache_pickleable(Cache_imp): 59 | blocksize = 5 60 | size = 100 61 | cache = Cache_imp(blocksize, _fetcher, size) 62 | cache._fetch(0, 5) # fill in cache 63 | unpickled = pickle.loads(pickle.dumps(cache)) 64 | assert isinstance(unpickled, Cache_imp) 65 | assert unpickled.blocksize == blocksize 66 | assert unpickled.size == size 67 | assert unpickled._fetch(0, 10) == b"0" * 10 68 | 69 | 70 | @pytest.mark.parametrize( 71 | "size_requests", 72 | [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]], 73 | ) 74 | @pytest.mark.parametrize("blocksize", [1, 10, 52, 100]) 75 | def test_cache_basic(Cache_imp, blocksize, size_requests): 76 | cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters)) 77 | 78 | for start, end in size_requests: 79 | result = cache._fetch(start, end) 80 | expected = string.ascii_letters[start:end].encode() 81 | assert result == expected 82 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | FSSPEC: Filesystem interfaces for Python 2 | ====================================== 3 | 4 | Filesystem Spec (FSSPEC) is a project to unify various projects and classes to work with remote filesystems and 5 | file-system-like abstractions using a standard pythonic interface. 6 | 7 | 8 | .. 
_highlight:
 9 | 
10 | Highlights
11 | ----------
12 | 
13 | - based on s3fs and gcsfs
14 | - ``fsspec`` instances are serializable and can be passed between processes/machines
15 | - the ``OpenFiles`` file-like instances are also serializable
16 | - implementations provide random access, to enable only the part of a file required to be read; plus a template
17 |   to base other file-like classes on
18 | - file access can use transparent compression and text-mode
19 | - any file-system directory can be viewed as a key-value/mapping store
20 | - if installed, all file-system classes also subclass from ``pyarrow.filesystem.FileSystem``, so
21 |   can work with any arrow function expecting such an instance
22 | - writes can be transactional: stored in a temporary location and only moved to the final
23 |   destination when the transaction is committed
24 | - FUSE: mount any path from any backend to a point on your file-system
25 | - cached instances tokenised on the instance parameters
26 | 
27 | These are described further in the :doc:`features` section.
28 | 
29 | Installation
30 | ------------
31 | 
32 |     pip install fsspec
33 | 
34 | Not all included filesystems are usable by default without installing extra
35 | dependencies. For example, to be able to access data in S3::
36 | 
37 |     pip install fsspec[s3]
38 | 
39 | or
40 | 
41 |     conda install -c conda-forge fsspec
42 | 
43 | Implementations
44 | ---------------
45 | 
46 | This repo contains several file-system implementations, see :ref:`implementations`. However,
47 | the external projects ``s3fs`` and ``gcsfs`` depend on ``fsspec`` and share the same behaviours.
48 | ``Dask`` and ``Intake`` use ``fsspec`` internally for their IO needs.
49 | 
50 | The current list of known implementations can be found as follows
51 | 
52 | .. code-block:: python
53 | 
54 |     from fsspec.registry import known_implementations
55 |     known_implementations
56 | 
57 | These are only imported on request, which may fail if a required dependency is missing. The dictionary
58 | ``fsspec.registry`` contains all imported implementations, and can be mutated by user code, if necessary.
59 | 
60 | 
61 | .. toctree::
62 |    :maxdepth: 2
63 |    :caption: Contents:
64 | 
65 |    intro.rst
66 |    usage.rst
67 |    features.rst
68 |    api.rst
69 |    changelog.rst
70 |    developer.rst
71 | 
72 | 
73 | Indices and tables
74 | ==================
75 | 
76 | * :ref:`genindex`
77 | * :ref:`modindex`
78 | * :ref:`search`
79 | 
--------------------------------------------------------------------------------
/fsspec/implementations/tests/test_reference.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import pytest
 3 | 
 4 | import fsspec
 5 | from .test_http import data, realfile, server  # noqa: F401
 6 | from fsspec.implementations.reference import _unmodel_hdf5
 7 | 
 8 | 
 9 | def test_simple(server):  # noqa: F811
10 | 
11 |     refs = {"a": b"data", "b": (realfile, 0, 5), "c": (realfile, 1, 5)}
12 |     h = fsspec.filesystem("http")
13 |     fs = fsspec.filesystem("reference", references=refs, fs=h)
14 | 
15 |     assert fs.cat("a") == b"data"
16 |     assert fs.cat("b") == data[:5]
17 |     assert fs.cat("c") == data[1 : 1 + 5]
18 | 
19 | 
20 | def test_ls(server):  # noqa: F811
21 |     refs = {"a": b"data", "b": (realfile, 0, 5), "c/d": (realfile, 1, 6)}
22 |     h = fsspec.filesystem("http")
23 |     fs = fsspec.filesystem("reference", references=refs, fs=h)
24 | 
25 |     assert fs.ls("", detail=False) == ["a", "b", "c"]
26 |     assert {"name": "c", "type": "directory", "size": 0} in fs.ls("", detail=True)
27 |     assert fs.find("") == ["a", "b", "c/d"]
28 |     assert fs.find("", withdirs=True) == ["a", "b", "c", "c/d"]
29 | 
30 | 
31 | def test_err(m):
32 |     with pytest.raises(NotImplementedError):
33 |         fsspec.filesystem("reference", references={}, fs=m)
34 |     with pytest.raises(NotImplementedError):
35 |         fsspec.filesystem("reference", references={}, target_protocol="memory")
36 | 
37 | 
38 | def test_defaults(server):  # noqa: F811
39 |     refs = {"a": b"data", "b": (None, 0, 5)}
40 |     fs = fsspec.filesystem(
41 |         "reference", references=refs, target_protocol="http", target=realfile
42 |     )
43 | 
44 |     assert fs.cat("a") == b"data"
45 |     assert fs.cat("b") == data[:5]
46 | 
47 | 
48 | jdata = """{
49 |     "metadata": {
50 |         ".zattrs": {
51 |             "Conventions": "UGRID-0.9.0"
52 |         },
53 |         ".zgroup": {
54 |             "zarr_format": 2
55 |         },
56 |         "adcirc_mesh/.zarray": {
57 |             "chunks": [
58 |                 1
59 |             ],
60 |             "dtype": "
--------------------------------------------------------------------------------
/fsspec/config.py:
--------------------------------------------------------------------------------
 1 | import configparser
 2 | import json
 3 | import os
 4 | 
 5 | conf = {}
 6 | default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
 7 | conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
 8 | 
 9 | 
10 | def set_conf_env(conf_dict, envdict=os.environ):
11 |     """Set config values from environment variables
12 | 
13 |     Looks for variables of the form ``FSSPEC_<protocol>`` and
14 |     ``FSSPEC_<protocol>_<kwarg>``.
15 |     There is no attempt to convert strings, but the kwarg keys will
16 |     be lower-cased.
17 | 
18 |     Parameters
19 |     ----------
20 |     conf_dict : dict(str, dict)
21 |         This dict will be mutated
22 |     envdict : dict-like(str, str)
23 |         Source for the values - usually the real environment
24 |     """
25 |     for key in envdict:
26 |         if key.startswith("FSSPEC"):
27 |             if key.count("_") < 2:
28 |                 continue
29 |             _, proto, kwarg = key.split("_", 2)
30 |             conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key]
31 | 
32 | 
33 | def set_conf_files(cdir, conf_dict):
34 |     """Set config values from files
35 | 
36 |     Scans for INI and JSON files in the given directory, and uses their
37 |     contents to set the config. In case of repeated values, later values
38 |     win.
39 | 
40 |     In the case of INI files, all values are strings, and these will not
41 |     be converted.
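    For example (mirroring the test suite), an INI file with a section
    ``[proto]`` and an entry ``key=value`` produces the config entry
    ``{"proto": {"key": "value"}}``.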
42 | 
43 |     Parameters
44 |     ----------
45 |     cdir : str
46 |         Directory to search
47 |     conf_dict : dict(str, dict)
48 |         This dict will be mutated
49 |     """
50 |     if not os.path.isdir(cdir):
51 |         return
52 |     allfiles = sorted(os.listdir(cdir))
53 |     for fn in allfiles:
54 |         if fn.endswith(".ini"):
55 |             ini = configparser.ConfigParser()
56 |             ini.read(os.path.join(cdir, fn))
57 |             for key in ini:
58 |                 if key == "DEFAULT":
59 |                     continue
60 |                 conf_dict.setdefault(key, {}).update(dict(ini[key]))
61 |         if fn.endswith(".json"):
62 |             js = json.load(open(os.path.join(cdir, fn)))
63 |             for key in js:
64 |                 conf_dict.setdefault(key, {}).update(dict(js[key]))
65 | 
66 | 
67 | def apply_config(cls, kwargs, conf_dict=conf):
68 |     """Supply default values for kwargs when instantiating class
69 | 
70 |     Augments the passed kwargs, by finding entries in the config dict
71 |     which match the class's ``.protocol`` attribute (one or more str)
72 | 
73 |     Parameters
74 |     ----------
75 |     cls : file system implementation
76 |     kwargs : dict
77 |     conf_dict : dict of dict
78 |         Typically this is the global configuration
79 | 
80 |     Returns
81 |     -------
82 |     dict : the modified set of kwargs
83 |     """
84 |     protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol]
85 |     kw = {}
86 |     for proto in protos:
87 |         # default kwargs from the current state of the config
88 |         if proto in conf_dict:
89 |             kw.update(conf_dict[proto])
90 |     # explicit kwargs always win
91 |     kw.update(**kwargs)
92 |     kwargs = kw
93 |     return kwargs
94 | 
95 | 
96 | set_conf_files(conf_dir, conf)
97 | set_conf_env(conf)
--------------------------------------------------------------------------------
/fsspec/implementations/tests/test_memory.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import sys
 3 | 
 4 | 
 5 | def test_1(m):
 6 |     m.touch("/somefile")  # NB: is found with or without initial /
 7 |     m.touch("afiles/and/anothers")
 8 |     files = m.find("")
 9 |     if "somefile" in files:
10 |         assert files == ["afiles/and/anothers", "somefile"]
11 |     else:
12 |         assert files == ["/somefile", "afiles/and/anothers"]
13 | 
14 |     files = sorted(m.get_mapper(""))
15 |     if "somefile" in files:
16 |         assert files == ["afiles/and/anothers", "somefile"]
17 |     else:
18 |         assert files == ["/somefile", "afiles/and/anothers"]
19 | 
20 | 
21 | @pytest.mark.xfail(
22 |     sys.version_info < (3, 6),
23 |     reason="py35 error, see https://github.com/intake/filesystem_spec/issues/148",
24 | )
25 | def test_ls(m):
26 |     m.mkdir("/dir")
27 |     m.mkdir("/dir/dir1")
28 | 
29 |     m.touch("/dir/afile")
30 |     m.touch("/dir/dir1/bfile")
31 |     m.touch("/dir/dir1/cfile")
32 | 
33 |     assert m.ls("/", False) == ["/dir/"]
34 |     assert m.ls("/dir", False) == ["/dir/afile", "/dir/dir1/"]
35 |     assert m.ls("/dir", True)[0]["type"] == "file"
36 |     assert m.ls("/dir", True)[1]["type"] == "directory"
37 | 
38 |     assert len(m.ls("/dir/dir1")) == 2
39 | 
40 | 
41 | def test_directories(m):
42 |     with pytest.raises(NotADirectoryError):
43 |         m.mkdir("outer/inner", create_parents=False)
44 |     m.mkdir("outer/inner")
45 | 
46 |     assert m.ls("outer")
47 |     assert m.ls("outer/inner") == []
48 | 
49 |     with pytest.raises(OSError):
50 |         m.rmdir("outer")
51 | 
52 |     m.rmdir("outer/inner")
53 |     m.rmdir("outer")
54 | 
55 |     assert not m.store
56 | 
57 | 
58 | def test_mv_recursive(m):
59 |     m.mkdir("src")
60 |     m.touch("src/file.txt")
61 |     m.mv("src", "dest", recursive=True)
62 |     assert m.exists("dest/file.txt")
63 |     assert not m.exists("src")
64 | 
65 | 
66 | def test_rm_no_pseudo_dir(m):
67 |     m.touch("/dir1/dir2/file")
68 |     m.rm("/dir1", recursive=True)
69 |     assert not m.exists("/dir1/dir2/file")
70 |     assert not m.exists("/dir1/dir2")
71 |     assert not m.exists("/dir1")
72 | 
73 |     with pytest.raises(FileNotFoundError):
74 |         m.rm("/dir1", recursive=True)
75 | 
76 | 
77 | def test_rewind(m):
78 |     # https://github.com/intake/filesystem_spec/issues/349
79 |     with m.open("src/file.txt", "w") as f:
80 |         f.write("content")
81 |     with m.open("src/file.txt") as f:
82 |         assert f.tell() == 0
83 | 
84 | 
85 | def test_no_rewind_append_mode(m):
86 |     # https://github.com/intake/filesystem_spec/issues/349
87 |     with m.open("src/file.txt", "w") as f:
88 |         f.write("content")
89 |     with m.open("src/file.txt", "a") as f:
90 |         assert f.tell() == 7
91 | 
92 | 
93 | def test_moves(m):
94 |     m.touch("source.txt")
95 |     m.mv("source.txt", "target.txt")
96 | 
97 |     m.touch("source2.txt")
98 |     m.mv("source2.txt", "target2.txt", recursive=True)
99 |     assert m.find("") == ["target.txt", "target2.txt"]
100 | 
101 | 
102 | def test_rm_recursive_empty_subdir(m):
103 |     # https://github.com/intake/filesystem_spec/issues/500
104 |     m.mkdir("recdir")
105 |     m.mkdir("recdir/subdir2")
106 |     m.rm("recdir/", recursive=True)
107 |     assert not m.exists("recdir")
108 | 
--------------------------------------------------------------------------------
/docs/source/developer.rst:
--------------------------------------------------------------------------------
 1 | Developing with fsspec
 2 | ----------------------
 3 | 
 4 | Whereas the majority of the documentation describes the use of ``fsspec``
 5 | from the end-user's point of view, ``fsspec`` is used by many libraries
 6 | as the primary/only interface to file operations.
 7 | 
 8 | Clients of the library
 9 | ~~~~~~~~~~~~~~~~~~~~~~
10 | 
11 | The most common entry point for libraries which wish to rely on ``fsspec``
12 | will be ``open`` or ``open_files``, as a way of generating an object compatible
13 | with the python file interface. This actually produces an ``OpenFile`` instance,
14 | which can be serialised across a network, and resources are only engaged when
15 | entering a context, e.g.
16 | 
17 | .. code-block:: python
18 | 
19 |     with fsspec.open("protocol://path", 'rb', param=value) as f:
20 |         process_file(f)
21 | 
22 | Note the backend-specific parameters that can be passed in this call.
23 | 
24 | In cases where the caller wants to control the context directly, they can use the
25 | ``open`` method of the ``OpenFile``, or get the filesystem object directly,
26 | skipping the ``OpenFile`` route. In the latter case, text encoding and compression
27 | are **not** handled for you. The file-like object can also be used as a context
28 | manager, or the ``close()`` method must be called explicitly to release resources.
29 | 
30 | .. code-block:: python
31 | 
32 |     # OpenFile route
33 |     of = fsspec.open("protocol://path", 'rb', param=value)
34 |     f = of.open()
35 |     process_file(f)
36 |     f.close()
37 | 
38 |     # filesystem class route, context
39 |     fs = fsspec.filesystem("protocol", param=value)
40 |     with fs.open("path", "rb") as f:
41 |         process_file(f)
42 | 
43 |     # filesystem class route, explicit close
44 |     fs = fsspec.filesystem("protocol", param=value)
45 |     f = fs.open("path", "rb")
46 |     process_file(f)
47 |     f.close()
48 | 
49 | Implementing a backend
50 | ~~~~~~~~~~~~~~~~~~~~~~
51 | 
52 | The class ``AbstractFileSystem`` provides a template of the methods
53 | that a potential implementation should supply, as well as default
54 | implementation of functionality that depends on these. Methods that
55 | *could* be implemented are marked with ``NotImplementedError`` or
56 | ``pass`` (the latter specifically for directory operations that might
57 | not be required for some backends where directories are emulated).
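As an illustration only, a minimal read-only backend might look like the
following sketch (the ``myfs`` protocol name and class are invented for this
example; real implementations override many more methods):

.. code-block:: python

    import fsspec
    from fsspec.spec import AbstractFileSystem


    class MyFileSystem(AbstractFileSystem):
        protocol = "myfs"

        def ls(self, path, detail=False, **kwargs):
            # return a list of info dicts (detail=True) or names (detail=False)
            ...

        def _open(self, path, mode="rb", **kwargs):
            # return a file-like object for the given path
            ...


    fsspec.register_implementation("myfs", MyFileSystem)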
Methods that *could* be implemented are marked with ``NotImplementedError`` or
56 | ``pass`` (the latter pattern is specifically for directory operations that
57 | might not be required for backends where directories are emulated).
58 | 
59 | Note that not all of the methods need to be implemented: for example,
60 | some implementations may be read-only, in which case things like ``pipe``,
61 | ``put``, ``touch``, ``rm``, etc., can be left as not-implemented
62 | (or you might implement them to raise ``PermissionError``, ``OSError`` with
63 | errno 30, or some other read-only exception).
64 | 
65 | We may eventually refactor ``AbstractFileSystem`` to split the default implementation,
66 | the set of methods that you might implement in a new backend, and the
67 | documented end-user API.
68 | 
69 | For now, new backends must register themselves on import
70 | (``register_implementation``) or post a PR to the ``fsspec`` repo
71 | asking to be included in ``fsspec.registry.known_implementations``.
72 | 
73 | Implementing async
74 | ~~~~~~~~~~~~~~~~~~
75 | 
76 | Starting in version 0.7.5, we provide async operations for some methods
77 | of some implementations.
78 | 
79 | This section will contain details on how to implement backends offering
80 | async, once the details are ironed out on our end.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # filesystem_spec
 2 | 
 3 | ![Build](https://github.com/intake/filesystem_spec/workflows/CI/badge.svg)
 4 | [![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
 5 | 
 6 | A specification for pythonic filesystems.
 7 | 
 8 | ## Install
 9 | 
10 | ```bash
11 | pip install fsspec
12 | ```
13 | or
14 | ```bash
15 | conda install -c conda-forge fsspec
16 | ```
17 | 
18 | ## Purpose
19 | 
20 | To produce a template or specification for a file-system interface, that specific implementations should follow,
21 | so that applications making use of them can rely on a common behaviour and not have to worry about the specific
22 | internal implementation decisions with any given backend. Many such implementations are included in this package,
23 | or in sister projects such as `s3fs` and `gcsfs`.
24 | 
25 | In addition, if this is well-designed, then additional functionality, such as a key-value store or FUSE
26 | mounting of the file-system implementation, may be available for all implementations "for free".
27 | 
28 | ## Documentation
29 | 
30 | Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
31 | 
32 | ## Develop
33 | 
34 | fsspec uses [tox](https://tox.readthedocs.io/en/latest/) and
35 | [tox-conda](https://github.com/tox-dev/tox-conda) to manage dev and test
36 | environments. First, install conda with tox and tox-conda in a base environment
37 | (e.g. `conda install -c conda-forge tox tox-conda`). Calls to `tox` can then be
38 | used to configure a development environment and run tests.
39 | 
40 | First, set up a development conda environment via `tox -e dev`. This will
41 | install fsspec dependencies, test & dev tools, and install fsspec in develop
42 | mode. Then, activate the dev environment under `.tox/dev` via `conda activate .tox/dev`.
43 | 
44 | ### Testing
45 | 
46 | Tests can be run directly in the activated dev environment via `pytest fsspec`.
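For example, to run a single backend's tests or filter by keyword (illustrative
pytest invocations, not project-specific commands):

```bash
# one test module, verbose
pytest fsspec/implementations/tests/test_memory.py -v

# all tests whose names mention transactions
pytest fsspec -k transaction
```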
47 | 
48 | The full fsspec test suite can be run via `tox`, which will set up and execute
49 | tests against multiple dependency versions in isolated environments. Run `tox
50 | -av` to list available test environments, and select environments via `tox -e `.
51 | 
52 | The full fsspec suite requires a system-level docker, docker-compose, and fuse
53 | installation. See `ci/install.sh` for a detailed installation example.
54 | 
55 | ### Code Formatting
56 | 
57 | fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure
58 | a consistent code format throughout the project. `black` is automatically
59 | installed in the tox dev env, activated via `conda activate .tox/dev`.
60 | 
61 | Then, run `black fsspec` from the root of the filesystem_spec repository to
62 | auto-format your code. Additionally, many editors have plugins that will apply
63 | `black` as you edit files.
64 | 
65 | Optionally, you may wish to set up [pre-commit hooks](https://pre-commit.com) to
66 | automatically run `black` when you make a git commit; `pre-commit` is likewise
67 | available in the tox dev env.
68 | 
69 | Then, run `pre-commit install --install-hooks` from the root of the
70 | filesystem_spec repository to set up pre-commit hooks. `black` will now be run
71 | before you commit, reformatting any changed files. You can format without
72 | committing via `pre-commit run` or skip these checks with `git commit
73 | --no-verify`.
--------------------------------------------------------------------------------
/fsspec/implementations/git.py:
--------------------------------------------------------------------------------
  1 | import pygit2
  2 | from fsspec.spec import AbstractFileSystem
  3 | from .memory import MemoryFile
  4 | import os
  5 | 
  6 | 
  7 | class GitFileSystem(AbstractFileSystem):
  8 |     """Browse the files of a local git repo at any hash/tag/branch
  9 | 
 10 |     (experimental backend)
 11 |     """
 12 | 
 13 |     root_marker = ""
 14 | 
 15 |     def __init__(self, path=None, ref=None, **kwargs):
 16 |         """
 17 | 
 18 |         Parameters
 19 |         ----------
 20 |         path: str (optional)
 21 |             Local location of the repo (uses current directory if not given)
 22 |         ref: str (optional)
 23 |             Reference to work with, could be a hash, tag or branch name. Defaults
 24 |             to the current working tree.
Note that ``ls`` and ``open`` also take hash, 25 | so this becomes the default for those operations 26 | kwargs 27 | """ 28 | super().__init__(**kwargs) 29 | self.repo = pygit2.Repository(path or os.getcwd()) 30 | self.ref = ref or "master" 31 | 32 | @classmethod 33 | def _strip_protocol(cls, path): 34 | return super()._strip_protocol(path).lstrip("/") 35 | 36 | def _path_to_object(self, path, ref): 37 | comm, ref = self.repo.resolve_refish(ref or self.ref) 38 | parts = path.split("/") 39 | tree = comm.tree 40 | for part in parts: 41 | if part and isinstance(tree, pygit2.Tree): 42 | tree = tree[part] 43 | return tree 44 | 45 | def ls(self, path, detail=True, ref=None, **kwargs): 46 | path = self._strip_protocol(path) 47 | tree = self._path_to_object(path, ref) 48 | if isinstance(tree, pygit2.Tree): 49 | out = [] 50 | for obj in tree: 51 | if isinstance(obj, pygit2.Tree): 52 | out.append( 53 | { 54 | "type": "directory", 55 | "name": "/".join([path, obj.name]).lstrip("/"), 56 | "hex": obj.hex, 57 | "mode": "%o" % obj.filemode, 58 | "size": 0, 59 | } 60 | ) 61 | else: 62 | out.append( 63 | { 64 | "type": "file", 65 | "name": "/".join([path, obj.name]).lstrip("/"), 66 | "hex": obj.hex, 67 | "mode": "%o" % obj.filemode, 68 | "size": obj.size, 69 | } 70 | ) 71 | else: 72 | obj = tree 73 | out = [ 74 | { 75 | "type": "file", 76 | "name": obj.name, 77 | "hex": obj.hex, 78 | "mode": "%o" % obj.filemode, 79 | "size": obj.size, 80 | } 81 | ] 82 | if detail: 83 | return out 84 | return [o["name"] for o in out] 85 | 86 | def ukey(self, path, ref=None): 87 | return self.info(path, ref=ref)["hex"] 88 | 89 | def _open( 90 | self, 91 | path, 92 | mode="rb", 93 | block_size=None, 94 | autocommit=True, 95 | cache_options=None, 96 | ref=None, 97 | **kwargs 98 | ): 99 | obj = self._path_to_object(path, ref or self.ref) 100 | return MemoryFile(data=obj.data) 101 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_smb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Test SMBFileSystem class using a docker container 4 | """ 5 | 6 | import logging 7 | import shlex 8 | import subprocess 9 | import time 10 | import pytest 11 | import fsspec 12 | 13 | pytest.importorskip("smbprotocol") 14 | 15 | # ! 
pylint: disable=redefined-outer-name,missing-function-docstring 16 | 17 | 18 | def stop_docker(container): 19 | cmd = shlex.split('docker ps -a -q --filter "name=%s"' % container) 20 | cid = subprocess.check_output(cmd).strip().decode() 21 | if cid: 22 | subprocess.call(["docker", "rm", "-f", "-v", cid]) 23 | 24 | 25 | @pytest.fixture(scope="module") 26 | def smb_params(): 27 | try: 28 | pchk = ["docker", "run", "--name", "fsspec_test_smb", "hello-world"] 29 | subprocess.check_call(pchk) 30 | except (subprocess.CalledProcessError, FileNotFoundError): 31 | pytest.skip("docker run not available") 32 | return 33 | stop_docker("fsspec_test_smb") 34 | 35 | # requires docker 36 | container = "fsspec_smb" 37 | stop_docker(container) 38 | img = "docker run --name {} --detach -p 139:139 -p 445:445 dperson/samba" 39 | cfg = " -p -u 'testuser;testpass' -s 'home;/share;no;no;no;testuser'" 40 | cmd = img.format(container) + cfg 41 | cid = subprocess.check_output(shlex.split(cmd)).strip().decode() 42 | logger = logging.getLogger("fsspec") 43 | logger.debug("Container: %s", cid) 44 | try: 45 | time.sleep(1) 46 | yield dict(host="localhost", port=445, username="testuser", password="testpass") 47 | finally: 48 | import smbclient # pylint: disable=import-outside-toplevel 49 | 50 | smbclient.reset_connection_cache() 51 | stop_docker(container) 52 | 53 | 54 | def test_simple(smb_params): 55 | adir = "/home/adir" 56 | adir2 = "/home/adir/otherdir/" 57 | afile = "/home/adir/otherdir/afile" 58 | fsmb = fsspec.get_filesystem_class("smb")(**smb_params) 59 | fsmb.mkdirs(adir2) 60 | fsmb.touch(afile) 61 | assert fsmb.find(adir) == [afile] 62 | assert fsmb.ls(adir2, detail=False) == [afile] 63 | assert fsmb.info(afile)["type"] == "file" 64 | assert fsmb.info(afile)["size"] == 0 65 | assert fsmb.exists(adir) 66 | fsmb.rm(adir, recursive=True) 67 | assert not fsmb.exists(adir) 68 | 69 | 70 | def test_with_url(smb_params): 71 | smb_url = "smb://{username}:{password}@{host}:{port}/home/someuser.txt" 72 | fwo = fsspec.open(smb_url.format(**smb_params), "wb") 73 | with fwo as fwr: 74 | fwr.write(b"hello") 75 | fro = fsspec.open(smb_url.format(**smb_params), "rb") 76 | with fro as frd: 77 | read_result = frd.read() 78 | assert read_result == b"hello" 79 | 80 | 81 | def test_transaction(smb_params): 82 | afile = "/home/afolder/otherdir/afile" 83 | afile2 = "/home/afolder/otherdir/afile2" 84 | adir = "/home/afolder" 85 | adir2 = "/home/afolder/otherdir" 86 | fsmb = fsspec.get_filesystem_class("smb")(**smb_params) 87 | fsmb.mkdirs(adir2) 88 | fsmb.start_transaction() 89 | fsmb.touch(afile) 90 | assert fsmb.find(adir) == [] 91 | fsmb.end_transaction() 92 | assert fsmb.find(adir) == [afile] 93 | 94 | with fsmb.transaction: 95 | assert fsmb._intrans 96 | fsmb.touch(afile2) 97 | assert fsmb.find(adir) == [afile] 98 | assert fsmb.find(adir) == [afile, afile2] 99 | 100 | 101 | def test_makedirs_exist_ok(smb_params): 102 | fsmb = fsspec.get_filesystem_class("smb")(**smb_params) 103 | fsmb.makedirs("/home/a/b/c") 104 | fsmb.makedirs("/home/a/b/c", exist_ok=True) 105 | -------------------------------------------------------------------------------- /docs/source/usage.rst: -------------------------------------------------------------------------------- 1 | Usage 2 | ===== 3 | 4 | This is quick-start documentation to help people get familiar with the layout and functioning of ``fsspec``. 
 5 | 
 6 | Instantiate a file-system
 7 | -------------------------
 8 | 
 9 | ``fsspec`` provides an abstract file-system interface as a template for other filesystems. In this context,
10 | "interface" means an API for working with files on the given file-system, which can mean files on some
11 | remote store, local files, files within some wrapper, or anything else that is capable of producing
12 | file-like objects.
13 | 
14 | Some concrete implementations are bundled with ``fsspec`` and others can be installed separately. They
15 | can be instantiated directly, or the `registry` can be used to find them.
16 | 
17 | Direct instantiation:
18 | 
19 | .. code-block:: python
20 | 
21 |     from fsspec.implementations.local import LocalFileSystem
22 |     fs = LocalFileSystem()
23 | 
24 | Look-up via registry:
25 | 
26 | .. code-block:: python
27 | 
28 |     import fsspec
29 |     fs = fsspec.filesystem('file')
30 | 
31 | Many filesystems also take extra parameters, some of which may be optional; see :doc:`api`.
32 | 
33 | .. code-block:: python
34 | 
35 |     import fsspec
36 |     fs = fsspec.filesystem('ftp', host=host, port=port,
37 |                            username=user, password=pw)
38 | 
39 | Use a file-system
40 | -----------------
41 | 
42 | File-system instances offer a large number of methods for getting information about and manipulating files
43 | for the given back-end. Although some specific implementations may not offer all features (e.g., ``http``
44 | is read-only), generally all normal operations, such as ``ls`` and ``rm``, should be expected to work (see the
45 | full list: :class:`fsspec.spec.AbstractFileSystem`).
46 | Note that this quick-start will prefer posix-style naming, but
47 | many common operations are aliased: ``cp()`` and ``copy()`` are identical, for instance.
48 | Functionality is generally chosen to be as close as possible to the builtin ``os`` module's
49 | behaviour for things like ``glob``.
50 | 
51 | The ``open()`` method will return a file-like object which can be passed to any other library that expects
52 | to work with python files. These will normally be binary-mode only, but may implement internal buffering
53 | in order to limit the number of reads from a remote source. They respect the use of ``with`` contexts. If
54 | you have ``pandas`` installed, for example, you can do the following:
55 | 
56 | .. code-block:: python
57 | 
58 |     import fsspec
59 |     import pandas as pd
60 |     with fsspec.open('https://raw.githubusercontent.com/dask/'
61 |                      'fastparquet/master/test-data/nation.csv') as f:
62 |         df = pd.read_csv(f, sep='|', header=None)
63 | 
64 | Higher-level
65 | ------------
66 | 
67 | For many situations, the only function that will be needed is :func:`fsspec.open_files()`, which will return
68 | :class:`fsspec.core.OpenFile` instances created from a single URL and parameters to pass to the backend.
69 | This supports text-mode and compression on the fly, and the objects can be serialized for passing between
70 | processes or machines (so long as each has access to the same backend file-system). The protocol (i.e.,
71 | backend) is inferred from the URL passed, and glob characters are expanded in read mode (search for files)
72 | or write mode (create names). Critically, the file on the backend system is not actually opened until the
73 | ``OpenFile`` instance is used in a ``with`` context. For the example above:
74 | 
75 | .. code-block:: python
76 | 
77 |     of = fsspec.open('https://raw.githubusercontent.com/dask/'
78 |                      'fastparquet/master/test-data/nation.csv', mode='r')
79 |     # of is a not-yet-open OpenFile object.
The "with" context actually opens it 80 | with of as f: 81 | # now f is a text-mode file 82 | df = pd.read_csv(f, sep='|', header=None) 83 | 84 | -------------------------------------------------------------------------------- /fsspec/tests/test_mapping.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sys 4 | 5 | import fsspec 6 | from fsspec.implementations.memory import MemoryFileSystem 7 | import pytest 8 | 9 | 10 | def test_mapping_prefix(tmpdir): 11 | tmpdir = str(tmpdir) 12 | os.makedirs(os.path.join(tmpdir, "afolder")) 13 | open(os.path.join(tmpdir, "afile"), "w").write("test") 14 | open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") 15 | 16 | m = fsspec.get_mapper("file://" + tmpdir) 17 | assert "afile" in m 18 | assert m["afolder/anotherfile"] == b"test2" 19 | 20 | fs = fsspec.filesystem("file") 21 | m2 = fs.get_mapper(tmpdir) 22 | m3 = fs.get_mapper("file://" + tmpdir) 23 | 24 | assert m == m2 == m3 25 | 26 | 27 | def test_getitems_errors(tmpdir): 28 | tmpdir = str(tmpdir) 29 | os.makedirs(os.path.join(tmpdir, "afolder")) 30 | open(os.path.join(tmpdir, "afile"), "w").write("test") 31 | open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") 32 | m = fsspec.get_mapper("file://" + tmpdir) 33 | assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} 34 | with pytest.raises(KeyError): 35 | m.getitems(["afile", "bfile"]) 36 | out = m.getitems(["afile", "bfile"], on_error="return") 37 | assert isinstance(out["bfile"], KeyError) 38 | m = fsspec.get_mapper("file://" + tmpdir, missing_exceptions=()) 39 | assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} 40 | with pytest.raises(FileNotFoundError): 41 | m.getitems(["afile", "bfile"]) 42 | 43 | 44 | def test_ops(): 45 | MemoryFileSystem.store.clear() 46 | m = fsspec.get_mapper("memory://") 47 | assert not m 48 | assert list(m) == [] 49 | 50 | with pytest.raises(KeyError): 51 | m["hi"] 52 | 53 | assert m.pop("key", 0) == 0 54 | 55 | m["key0"] = b"data" 56 | assert list(m) == ["key0"] 57 | assert m["key0"] == b"data" 58 | 59 | m.clear() 60 | 61 | assert list(m) == [] 62 | 63 | 64 | def test_pickle(): 65 | m = fsspec.get_mapper("memory://") 66 | assert isinstance(m.fs, MemoryFileSystem) 67 | m["key"] = b"data" 68 | m2 = pickle.loads(pickle.dumps(m)) 69 | assert list(m) == list(m2) 70 | assert m.missing_exceptions == m2.missing_exceptions 71 | 72 | 73 | def test_keys_view(): 74 | # https://github.com/intake/filesystem_spec/issues/186 75 | m = fsspec.get_mapper("memory://") 76 | m["key"] = b"data" 77 | 78 | keys = m.keys() 79 | assert len(keys) == 1 80 | # check that we don't consume the keys 81 | assert len(keys) == 1 82 | m.clear() 83 | 84 | 85 | def test_multi(): 86 | m = fsspec.get_mapper("memory://") 87 | data = {"a": b"data1", "b": b"data2"} 88 | m.setitems(data) 89 | 90 | assert m.getitems(list(data)) == data 91 | m.delitems(list(data)) 92 | assert not list(m) 93 | 94 | 95 | def test_setitem_types(): 96 | import array 97 | 98 | m = fsspec.get_mapper("memory://") 99 | m["a"] = array.array("i", [1]) 100 | if sys.byteorder == "little": 101 | assert m["a"] == b"\x01\x00\x00\x00" 102 | else: 103 | assert m["a"] == b"\x00\x00\x00\x01" 104 | m["b"] = bytearray(b"123") 105 | assert m["b"] == b"123" 106 | m.setitems({"c": array.array("i", [1]), "d": bytearray(b"123")}) 107 | if sys.byteorder == "little": 108 | assert m["c"] == b"\x01\x00\x00\x00" 109 | else: 110 | assert m["c"] == 
b"\x00\x00\x00\x01" 111 | assert m["d"] == b"123" 112 | 113 | 114 | def test_setitem_numpy(): 115 | m = fsspec.get_mapper("memory://") 116 | np = pytest.importorskip("numpy") 117 | m["c"] = np.array(1, dtype=" 0, "Timeout waiting for HDFS" 32 | time.sleep(1) 33 | continue 34 | break 35 | time.sleep(7) 36 | yield "localhost" 37 | finally: 38 | subprocess.check_output(cmd0) 39 | 40 | 41 | def test_pickle(hdfs_cluster): 42 | w = WebHDFS(hdfs_cluster, user="testuser") 43 | w2 = pickle.loads(pickle.dumps(w)) 44 | assert w == w2 45 | 46 | 47 | def test_simple(hdfs_cluster): 48 | w = WebHDFS(hdfs_cluster, user="testuser") 49 | home = w.home_directory() 50 | assert home == "/user/testuser" 51 | with pytest.raises(PermissionError): 52 | w.mkdir("/root") 53 | 54 | 55 | def test_url(hdfs_cluster): 56 | url = "webhdfs://testuser@localhost:50070/user/testuser/myfile" 57 | fo = fsspec.open(url, "wb", data_proxy={"worker.example.com": "localhost"}) 58 | with fo as f: 59 | f.write(b"hello") 60 | fo = fsspec.open(url, "rb", data_proxy={"worker.example.com": "localhost"}) 61 | with fo as f: 62 | assert f.read() == b"hello" 63 | 64 | 65 | def test_workflow(hdfs_cluster): 66 | w = WebHDFS( 67 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 68 | ) 69 | fn = "/user/testuser/testrun/afile" 70 | w.mkdir("/user/testuser/testrun") 71 | with w.open(fn, "wb") as f: 72 | f.write(b"hello") 73 | assert w.exists(fn) 74 | info = w.info(fn) 75 | assert info["size"] == 5 76 | assert w.isfile(fn) 77 | assert w.cat(fn) == b"hello" 78 | w.rm("/user/testuser/testrun", recursive=True) 79 | assert not w.exists(fn) 80 | 81 | 82 | def test_with_gzip(hdfs_cluster): 83 | from gzip import GzipFile 84 | 85 | w = WebHDFS( 86 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 87 | ) 88 | fn = "/user/testuser/gzfile" 89 | with w.open(fn, "wb") as f: 90 | gf = GzipFile(fileobj=f, mode="w") 91 | gf.write(b"hello") 92 | gf.close() 93 | with w.open(fn, "rb") as f: 94 | gf = GzipFile(fileobj=f, mode="r") 95 | assert gf.read() == b"hello" 96 | 97 | 98 | def test_workflow_transaction(hdfs_cluster): 99 | w = WebHDFS( 100 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 101 | ) 102 | fn = "/user/testuser/testrun/afile" 103 | w.mkdirs("/user/testuser/testrun") 104 | with w.transaction: 105 | with w.open(fn, "wb") as f: 106 | f.write(b"hello") 107 | assert not w.exists(fn) 108 | assert w.exists(fn) 109 | assert w.ukey(fn) 110 | files = w.ls("/user/testuser/testrun", True) 111 | summ = w.content_summary("/user/testuser/testrun") 112 | assert summ["length"] == files[0]["size"] 113 | assert summ["fileCount"] == 1 114 | 115 | w.rm("/user/testuser/testrun", recursive=True) 116 | assert not w.exists(fn) 117 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_sftp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import shlex 3 | import subprocess 4 | import time 5 | import fsspec 6 | 7 | pytest.importorskip("paramiko") 8 | 9 | 10 | def stop_docker(name): 11 | cmd = shlex.split('docker ps -a -q --filter "name=%s"' % name) 12 | cid = subprocess.check_output(cmd).strip().decode() 13 | if cid: 14 | subprocess.call(["docker", "rm", "-f", cid]) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def ssh(): 19 | try: 20 | subprocess.check_call(["docker", "run", "hello-world"]) 21 | except (subprocess.CalledProcessError, FileNotFoundError): 22 | 
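        # Docker (or its daemon) is unavailable, so the container-backed SSH
        # server cannot be started; skip every test in this module.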
        pytest.skip("docker run not available")
 23 |         return
 24 | 
 25 |     # requires docker
 26 |     cmds = [
 27 |         r"apt-get update",
 28 |         r"apt-get install -y openssh-server",
 29 |         r"mkdir /var/run/sshd",
 30 |         "bash -c \"echo 'root:pass' | chpasswd\"",
 31 |         (
 32 |             r"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' "
 33 |             r"/etc/ssh/sshd_config"
 34 |         ),
 35 |         (
 36 |             r"sed 's@session\s*required\s*pam_loginuid.so@session optional "
 37 |             r"pam_loginuid.so@g' -i /etc/pam.d/sshd"
 38 |         ),
 39 |         r'bash -c "echo \"export VISIBLE=now\" >> /etc/profile"',
 40 |         r"/usr/sbin/sshd",
 41 |     ]
 42 |     name = "fsspec_sftp"
 43 |     stop_docker(name)
 44 |     cmd = "docker run -d -p 9200:22 --name {} ubuntu:16.04 sleep 9000".format(name)
 45 |     cid = subprocess.check_output(shlex.split(cmd)).strip().decode()
 46 |     for cmd in cmds:
 47 |         subprocess.call(["docker", "exec", cid] + shlex.split(cmd))
 48 |     try:
 49 |         time.sleep(1)
 50 |         yield dict(host="localhost", port=9200, username="root", password="pass")
 51 |     finally:
 52 |         stop_docker(name)
 53 | 
 54 | 
 55 | def test_simple(ssh):
 56 |     f = fsspec.get_filesystem_class("sftp")(**ssh)
 57 |     f.mkdirs("/home/someuser/deeper")
 58 |     f.touch("/home/someuser/deeper/afile")
 59 |     assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
 60 |     assert f.ls("/home/someuser/deeper/") == ["/home/someuser/deeper/afile"]
 61 |     assert f.info("/home/someuser/deeper/afile")["type"] == "file"
 62 |     assert f.info("/home/someuser/deeper/afile")["size"] == 0
 63 |     assert f.exists("/home/someuser")
 64 |     f.rm("/home/someuser", recursive=True)
 65 |     assert not f.exists("/home/someuser")
 66 | 
 67 | 
 68 | @pytest.mark.parametrize("protocol", ["sftp", "ssh"])
 69 | def test_with_url(protocol, ssh):
 70 |     fo = fsspec.open(
 71 |         protocol + "://{username}:{password}@{host}:{port}"
 72 |         "/home/someuserout".format(**ssh),
 73 |         "wb",
 74 |     )
 75 |     with fo as f:
 76 |         f.write(b"hello")
 77 |     fo = fsspec.open(
 78 |         protocol + "://{username}:{password}@{host}:{port}"
 79 |         "/home/someuserout".format(**ssh),
 80 |         "rb",
 81 |     )
 82 |     with fo as f:
 83 |         assert f.read() == b"hello"
 84 | 
 85 | 
 86 | def test_transaction(ssh):
 87 |     f = fsspec.get_filesystem_class("sftp")(**ssh)
 88 |     f.mkdirs("/home/someuser/deeper")
 89 |     f.start_transaction()
 90 |     f.touch("/home/someuser/deeper/afile")
 91 |     assert f.find("/home/someuser") == []
 92 |     f.end_transaction()
 93 |     assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
 94 | 
 95 |     with f.transaction:
 96 |         assert f._intrans
 97 |         f.touch("/home/someuser/deeper/afile2")
 98 |         assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
 99 |     assert f.find("/home/someuser") == [
100 |         "/home/someuser/deeper/afile",
101 |         "/home/someuser/deeper/afile2",
102 |     ]
103 | 
104 | 
105 | def test_makedirs_exist_ok(ssh):
106 |     f = fsspec.get_filesystem_class("sftp")(**ssh)
107 | 
108 |     f.makedirs("/a/b/c")
109 | 
110 |     with pytest.raises(FileExistsError, match="/a/b/c"):
111 |         f.makedirs("/a/b/c", exist_ok=False)
112 | 
113 |     f.makedirs("/a/b/c", exist_ok=True)
--------------------------------------------------------------------------------
/fsspec/implementations/tests/test_ftp.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pytest
 3 | import subprocess
 4 | import sys
 5 | import time
 6 | 
 7 | from fsspec.implementations.ftp import FTPFileSystem
 8 | from fsspec import open_files
 9 | import fsspec
10 | 
11 | here = os.path.dirname(os.path.abspath(__file__))
12 | 
13 | 
14 | @pytest.fixture()
15 | def ftp():
16 | 
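    # Start a throwaway FTP server via pyftpdlib, serving this tests directory
    # on its default port 2121; the subprocess is terminated when the fixture
    # goes out of scope.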
pytest.importorskip("pyftpdlib") 17 | P = subprocess.Popen( 18 | [sys.executable, "-m", "pyftpdlib", "-d", here], 19 | stderr=subprocess.STDOUT, 20 | stdout=subprocess.PIPE, 21 | ) 22 | try: 23 | time.sleep(1) 24 | yield "localhost", 2121 25 | finally: 26 | P.terminate() 27 | P.wait() 28 | 29 | 30 | def test_basic(ftp): 31 | host, port = ftp 32 | fs = FTPFileSystem(host, port) 33 | assert fs.ls("/", detail=False) == sorted(os.listdir(here)) 34 | out = fs.cat("/" + os.path.basename(__file__)) 35 | assert out == open(__file__, "rb").read() 36 | 37 | 38 | def test_not_cached(ftp): 39 | host, port = ftp 40 | fs = FTPFileSystem(host, port) 41 | fs2 = FTPFileSystem(host, port) 42 | assert fs is not fs2 43 | 44 | 45 | @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) 46 | def test_complex(ftp_writable, cache_type): 47 | from fsspec.core import BytesCache 48 | 49 | host, port, user, pw = ftp_writable 50 | files = open_files( 51 | "ftp:///ou*", 52 | host=host, 53 | port=port, 54 | username=user, 55 | password=pw, 56 | block_size=10000, 57 | cache_type=cache_type, 58 | ) 59 | assert len(files) == 1 60 | with files[0] as fo: 61 | assert fo.read(10) == b"hellohello" 62 | if isinstance(fo.cache, BytesCache): 63 | assert len(fo.cache.cache) == 10010 64 | assert fo.read(2) == b"he" 65 | assert fo.tell() == 12 66 | 67 | 68 | def test_write_small(ftp_writable): 69 | host, port, user, pw = ftp_writable 70 | fs = FTPFileSystem(host, port, user, pw) 71 | with fs.open("/out2", "wb") as f: 72 | f.write(b"oi") 73 | assert fs.cat("/out2") == b"oi" 74 | 75 | 76 | def test_with_url(ftp_writable): 77 | host, port, user, pw = ftp_writable 78 | fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "wb") 79 | with fo as f: 80 | f.write(b"hello") 81 | fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "rb") 82 | with fo as f: 83 | assert f.read() == b"hello" 84 | 85 | 86 | @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) 87 | def test_write_big(ftp_writable, cache_type): 88 | host, port, user, pw = ftp_writable 89 | fs = FTPFileSystem(host, port, user, pw, block_size=1000, cache_type=cache_type) 90 | fn = "/bigger" 91 | with fs.open(fn, "wb") as f: 92 | f.write(b"o" * 500) 93 | assert not fs.exists(fn) 94 | f.write(b"o" * 1000) 95 | fs.invalidate_cache() 96 | assert fs.exists(fn) 97 | f.write(b"o" * 200) 98 | f.flush() 99 | 100 | assert fs.info(fn)["size"] == 1700 101 | assert fs.cat(fn) == b"o" * 1700 102 | 103 | 104 | def test_transaction(ftp_writable): 105 | host, port, user, pw = ftp_writable 106 | fs = FTPFileSystem(host, port, user, pw) 107 | fs.mkdir("/tmp") 108 | fn = "/tr" 109 | with fs.transaction: 110 | with fs.open(fn, "wb") as f: 111 | f.write(b"not") 112 | assert not fs.exists(fn) 113 | assert fs.exists(fn) 114 | assert fs.cat(fn) == b"not" 115 | 116 | fs.rm(fn) 117 | assert not fs.exists(fn) 118 | 119 | 120 | def test_transaction_with_cache(ftp_writable): 121 | host, port, user, pw = ftp_writable 122 | fs = FTPFileSystem(host, port, user, pw) 123 | fs.mkdir("/tmp") 124 | fs.mkdir("/tmp/dir") 125 | assert "dir" in fs.ls("/tmp", detail=False) 126 | 127 | with fs.transaction: 128 | fs.rmdir("/tmp/dir") 129 | 130 | assert "dir" not in fs.ls("/tmp", detail=False) 131 | assert not fs.exists("/tmp/dir") 132 | -------------------------------------------------------------------------------- /fsspec/tests/test_registry.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from unittest.mock import 
create_autospec, patch 3 | 4 | import pytest 5 | 6 | from fsspec.registry import ( 7 | ReadOnlyError, 8 | _registry, 9 | get_filesystem_class, 10 | known_implementations, 11 | register_implementation, 12 | registry, 13 | ) 14 | from fsspec.spec import AbstractFileSystem 15 | 16 | try: 17 | from importlib.metadata import EntryPoint 18 | except ImportError: # python < 3.8 19 | from importlib_metadata import EntryPoint 20 | 21 | 22 | @pytest.fixture() 23 | def clear_registry(): 24 | try: 25 | yield 26 | finally: 27 | _registry.clear() 28 | known_implementations.pop("test", None) 29 | 30 | 31 | @pytest.fixture() 32 | def clean_imports(): 33 | try: 34 | real_module = sys.modules["fsspec"] 35 | del sys.modules["fsspec"] 36 | yield 37 | finally: 38 | sys.modules["fsspec"] = real_module 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "protocol,module,minversion,oldversion", 43 | [("s3", "s3fs", "0.3.0", "0.1.0"), ("gs", "gcsfs", "0.3.0", "0.1.0")], 44 | ) 45 | def test_minversion_s3fs(protocol, module, minversion, oldversion, monkeypatch): 46 | _registry.clear() 47 | mod = pytest.importorskip(module, minversion) 48 | 49 | assert get_filesystem_class("s3") is not None 50 | _registry.clear() 51 | 52 | monkeypatch.setattr(mod, "__version__", oldversion) 53 | with pytest.raises(RuntimeError, match=minversion): 54 | get_filesystem_class(protocol) 55 | 56 | 57 | def test_registry_readonly(): 58 | get_filesystem_class("file") 59 | assert "file" in registry 60 | assert "file" in list(registry) 61 | with pytest.raises(ReadOnlyError): 62 | del registry["file"] 63 | with pytest.raises(ReadOnlyError): 64 | registry["file"] = None 65 | with pytest.raises(ReadOnlyError): 66 | registry.clear() 67 | 68 | 69 | def test_register_cls(clear_registry): 70 | with pytest.raises(ValueError): 71 | get_filesystem_class("test") 72 | register_implementation("test", AbstractFileSystem) 73 | cls = get_filesystem_class("test") 74 | assert cls is AbstractFileSystem 75 | 76 | 77 | def test_register_str(clear_registry): 78 | with pytest.raises(ValueError): 79 | get_filesystem_class("test") 80 | register_implementation("test", "fsspec.AbstractFileSystem") 81 | assert "test" not in registry 82 | cls = get_filesystem_class("test") 83 | assert cls is AbstractFileSystem 84 | assert "test" in registry 85 | 86 | 87 | def test_register_fail(clear_registry): 88 | register_implementation("test", "doesntexist.AbstractFileSystem") 89 | with pytest.raises(ImportError): 90 | get_filesystem_class("test") 91 | 92 | register_implementation("test", "doesntexist.AbstractFileSystem") 93 | with pytest.raises(ValueError): 94 | register_implementation("test", "doesntexist.AbstractFileSystem", clobber=False) 95 | 96 | register_implementation( 97 | "test", "doesntexist.AbstractFileSystem", errtxt="hiho", clobber=True 98 | ) 99 | with pytest.raises(ImportError) as e: 100 | get_filesystem_class("test") 101 | assert "hiho" in str(e.value) 102 | register_implementation("test", AbstractFileSystem) 103 | 104 | with pytest.raises(ValueError): 105 | register_implementation("test", AbstractFileSystem, clobber=False) 106 | register_implementation("test", AbstractFileSystem, clobber=True) 107 | 108 | 109 | def test_entry_points_registered_on_import(clear_registry, clean_imports): 110 | mock_ep = create_autospec(EntryPoint, module="fsspec.spec.AbstractFileSystem") 111 | mock_ep.name = "test" # this can't be set in the constructor... 
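    # Patch the stdlib entry-point lookup so that importing fsspec "discovers"
    # the fake backend, mirroring how third-party packages advertise
    # implementations through the "fsspec.specs" entry-point group.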
112 |     if sys.version_info < (3, 8):
113 |         import_location = "importlib_metadata.entry_points"
114 |     else:
115 |         import_location = "importlib.metadata.entry_points"
116 |     with patch(import_location, return_value={"fsspec.specs": [mock_ep]}):
117 |         assert "test" not in registry
118 |         import fsspec
119 | 
120 |         get_filesystem_class("test")
121 |         assert "test" in registry
--------------------------------------------------------------------------------
/fsspec/implementations/jupyter.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import io
  3 | import fsspec
  4 | import re
  5 | import requests
  6 | 
  7 | 
  8 | class JupyterFileSystem(fsspec.AbstractFileSystem):
  9 |     """View of the files as seen by a Jupyter server (notebook or lab)"""
 10 | 
 11 |     protocol = ("jupyter", "jlab")
 12 | 
 13 |     def __init__(self, url, tok=None, **kwargs):
 14 |         """
 15 | 
 16 |         Parameters
 17 |         ----------
 18 |         url : str
 19 |             Base URL of the server, like "http://127.0.0.1:8888". May include
 20 |             token in the string, which is given by the process when starting up
 21 |         tok : str
 22 |             If the token is obtained separately, can be given here
 23 |         kwargs
 24 |         """
 25 |         if "?" in url:
 26 |             if tok is None:
 27 |                 try:
 28 |                     tok = re.findall("token=([a-z0-9]+)", url)[0]
 29 |                 except IndexError as e:
 30 |                     raise ValueError("Could not determine token") from e
 31 |             url = url.split("?", 1)[0]
 32 |         self.url = url.rstrip("/") + "/api/contents"
 33 |         self.session = requests.Session()
 34 |         if tok:
 35 |             self.session.headers["Authorization"] = f"token {tok}"
 36 | 
 37 |         super().__init__(**kwargs)
 38 | 
 39 |     def ls(self, path, detail=True, **kwargs):
 40 |         path = self._strip_protocol(path)
 41 |         r = self.session.get(self.url + "/" + path)
 42 |         if r.status_code == 404:
 43 |             raise FileNotFoundError(path)
 44 |         r.raise_for_status()
 45 |         out = r.json()
 46 | 
 47 |         if out["type"] == "directory":
 48 |             out = out["content"]
 49 |         else:
 50 |             out = [out]
 51 |         for o in out:
 52 |             o["name"] = o.pop("path")
 53 |             o.pop("content")
 54 |             if o["type"] == "notebook":
 55 |                 o["type"] = "file"
 56 |         if detail:
 57 |             return out
 58 |         return [o["name"] for o in out]
 59 | 
 60 |     def cat_file(self, path):
 61 |         path = self._strip_protocol(path)
 62 |         r = self.session.get(self.url + "/" + path)
 63 |         if r.status_code == 404:
 64 |             raise FileNotFoundError(path)
 65 |         r.raise_for_status()
 66 |         out = r.json()
 67 |         if out["format"] == "text":
 68 |             # data should be binary
 69 |             return out["content"].encode()
 70 |         else:
 71 |             return base64.b64decode(out["content"])
 72 | 
 73 |     def pipe_file(self, path, value, **_):
 74 |         path = self._strip_protocol(path)
 75 |         json = {
 76 |             "name": path.rsplit("/", 1)[-1],
 77 |             "path": path,
 78 |             "size": len(value),
 79 |             "content": base64.b64encode(value).decode(),
 80 |             "format": "base64",
 81 |             "type": "file",
 82 |         }
 83 |         self.session.put(self.url + "/" + path, json=json)
 84 | 
 85 |     def mkdir(self, path, create_parents=True, **kwargs):
 86 |         path = self._strip_protocol(path)
 87 |         if create_parents and "/" in path:
 88 |             self.mkdir(path.rsplit("/", 1)[0], True)
 89 |         json = {
 90 |             "name": path.rsplit("/", 1)[-1],
 91 |             "path": path,
 92 |             "size": None,
 93 |             "content": None,
 94 |             "type": "directory",
 95 |         }
 96 |         self.session.put(self.url + "/" + path, json=json)
 97 | 
 98 |     def _rm(self, path):
 99 |         path = self._strip_protocol(path)
100 |         self.session.delete(self.url + "/" + path)
101 | 
102 |     def _open(self, path, mode="rb", **kwargs):
103 |         path = self._strip_protocol(path)
104 |         if mode == "rb":
105 |             data = self.cat_file(path)
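            # the server offers no random access, so fetch the whole file once
            # and hand back a seekable in-memory view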
106 |             return io.BytesIO(data)
107 |         else:
108 |             return SimpleFileWriter(self, path, mode="wb")
109 | 
110 | 
111 | class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
112 |     def _upload_chunk(self, final=False):
113 |         """Never uploads a chunk until the file is done
114 | 
115 |         Not suitable for large files
116 |         """
117 |         if final is False:
118 |             return False
119 |         self.buffer.seek(0)
120 |         data = self.buffer.read()
121 |         self.fs.pipe_file(self.path, data)
--------------------------------------------------------------------------------
/fsspec/implementations/tests/test_dbfs.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test-Cases for the DataBricks Filesystem.
 3 | This test case is somewhat special, as there is no "mock" databricks
 4 | API available. We use the "vcr" package to record the requests and
 5 | responses to the real databricks API and replay them on tests.
 6 | 
 7 | This however means that when you change the tests (or when the API
 8 | itself changes, which is very unlikely to occur as it is versioned),
 9 | you need to re-record the answers. This can be done as follows:
10 | 
11 | 1. Delete all cassette files in the "./cassettes" folder
12 | 2. Spin up a databricks cluster. For example,
13 |    you can use an Azure Databricks instance for this.
14 | 3. Take note of the instance details (the instance URL. For example for an Azure
15 |    databricks cluster, this has the form
16 |    adb-..azuredatabricks.net)
17 |    and your personal token (Find out more here:
18 |    https://docs.databricks.com/dev-tools/api/latest/authentication.html)
19 | 4. Set the two environment variables `DBFS_INSTANCE` and `DBFS_TOKEN`
20 | 5. Now execute the tests as normal. The results of the API calls will be recorded.
21 | 6. Unset the environment variables and replay the tests.
22 | """
23 | from urllib.parse import urlparse
24 | import os
25 | 
26 | import pytest
27 | import fsspec
28 | 
29 | DUMMY_INSTANCE = "my_instance.com"
30 | INSTANCE = os.getenv("DBFS_INSTANCE", DUMMY_INSTANCE)
31 | TOKEN = os.getenv("DBFS_TOKEN", "")
32 | 
33 | 
34 | @pytest.fixture(scope="module")
35 | def vcr_config():
36 |     """
37 |     To avoid recording the instance and token details
38 |     (which are sensitive), we delete them from both the
39 |     request and the response before storing them.
40 |     We also delete the date as it is likely to change
41 |     (and will make git diffs harder).
42 |     If the DBFS_TOKEN env variable is set, we record with VCR.
43 |     If not, we only replay (to not accidentally record with a wrong URL).
44 | """ 45 | 46 | def before_record_response(response): 47 | try: 48 | del response["headers"]["x-databricks-org-id"] 49 | del response["headers"]["date"] 50 | except KeyError: 51 | pass 52 | return response 53 | 54 | def before_record_request(request): 55 | # Replace the instance URL 56 | uri = urlparse(request.uri) 57 | uri = uri._replace(netloc=DUMMY_INSTANCE) 58 | request.uri = uri.geturl() 59 | 60 | return request 61 | 62 | if TOKEN: 63 | return { 64 | "record_mode": "once", 65 | "filter_headers": [("authorization", "DUMMY")], 66 | "before_record_response": before_record_response, 67 | "before_record_request": before_record_request, 68 | } 69 | else: 70 | return { 71 | "record_mode": "none", 72 | } 73 | 74 | 75 | @pytest.fixture 76 | def dbfsFS(): 77 | fs = fsspec.filesystem( 78 | "dbfs", 79 | instance=INSTANCE, 80 | token=TOKEN, 81 | ) 82 | 83 | return fs 84 | 85 | 86 | @pytest.mark.vcr() 87 | def test_dbfs_file_listing(dbfsFS): 88 | assert "/FileStore" in dbfsFS.ls("/", detail=False) 89 | assert {"name": "/FileStore", "size": 0, "type": "directory"} in dbfsFS.ls( 90 | "/", detail=True 91 | ) 92 | 93 | 94 | @pytest.mark.vcr() 95 | def test_dbfs_mkdir(dbfsFS): 96 | dbfsFS.rm("/FileStore/my", recursive=True) 97 | assert "/FileStore/my" not in dbfsFS.ls("/FileStore/", detail=False) 98 | 99 | dbfsFS.mkdir("/FileStore/my/dir", create_parents=True) 100 | 101 | assert "/FileStore/my" in dbfsFS.ls("/FileStore/", detail=False) 102 | assert "/FileStore/my/dir" in dbfsFS.ls("/FileStore/my/", detail=False) 103 | 104 | with pytest.raises(FileExistsError): 105 | dbfsFS.mkdir("/FileStore/my/dir", create_parents=True, exist_ok=False) 106 | 107 | with pytest.raises(OSError): 108 | dbfsFS.rm("/FileStore/my", recursive=False) 109 | 110 | assert "/FileStore/my" in dbfsFS.ls("/FileStore/", detail=False) 111 | 112 | dbfsFS.rm("/FileStore/my", recursive=True) 113 | assert "/FileStore/my" not in dbfsFS.ls("/FileStore/", detail=False) 114 | 115 | 116 | @pytest.mark.vcr() 117 | def test_dbfs_write_and_read(dbfsFS): 118 | dbfsFS.rm("/FileStore/file.csv") 119 | assert "/FileStore/file.csv" not in dbfsFS.ls("/FileStore/", detail=False) 120 | 121 | content = b"This is a test\n" * 100000 + b"For this is the end\n" 122 | 123 | with dbfsFS.open("/FileStore/file.csv", "wb") as f: 124 | f.write(content) 125 | 126 | assert "/FileStore/file.csv" in dbfsFS.ls("/FileStore", detail=False) 127 | 128 | with dbfsFS.open("/FileStore/file.csv", "rb") as f: 129 | data = f.read() 130 | assert data == content 131 | 132 | dbfsFS.rm("/FileStore/file.csv") 133 | assert "/FileStore/file.csv" not in dbfsFS.ls("/FileStore/", detail=False) 134 | -------------------------------------------------------------------------------- /fsspec/implementations/sftp.py: -------------------------------------------------------------------------------- 1 | import paramiko 2 | from stat import S_ISDIR, S_ISLNK 3 | import types 4 | import uuid 5 | from .. import AbstractFileSystem 6 | from ..utils import infer_storage_options 7 | 8 | 9 | class SFTPFileSystem(AbstractFileSystem): 10 | """Files over SFTP/SSH 11 | 12 | Peer-to-peer filesystem over SSH using paramiko. 13 | 14 | Note: if using this with the ``open`` or ``open_files``, with full URLs, 15 | there is no way to tell if a path is relative, so all paths are assumed 16 | to be absolute. 
17 | """ 18 | 19 | protocol = "sftp", "ssh" 20 | 21 | def __init__(self, host, **ssh_kwargs): 22 | """ 23 | 24 | Parameters 25 | ---------- 26 | host: str 27 | Hostname or IP as a string 28 | temppath: str 29 | Location on the server to put files, when within a transaction 30 | ssh_kwargs: dict 31 | Parameters passed on to connection. See details in 32 | http://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect 33 | May include port, username, password... 34 | """ 35 | if self._cached: 36 | return 37 | super(SFTPFileSystem, self).__init__(**ssh_kwargs) 38 | self.temppath = ssh_kwargs.pop("temppath", "/tmp") 39 | self.host = host 40 | self.ssh_kwargs = ssh_kwargs 41 | self._connect() 42 | 43 | def _connect(self): 44 | self.client = paramiko.SSHClient() 45 | self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 46 | self.client.connect(self.host, **self.ssh_kwargs) 47 | self.ftp = self.client.open_sftp() 48 | 49 | @classmethod 50 | def _strip_protocol(cls, path): 51 | return infer_storage_options(path)["path"] 52 | 53 | @staticmethod 54 | def _get_kwargs_from_urls(urlpath): 55 | out = infer_storage_options(urlpath) 56 | out.pop("path", None) 57 | out.pop("protocol", None) 58 | return out 59 | 60 | def mkdir(self, path, mode=511): 61 | self.ftp.mkdir(path, mode) 62 | 63 | def makedirs(self, path, exist_ok=False, mode=511): 64 | if self.exists(path) and not exist_ok: 65 | raise FileExistsError("File exists: {}".format(path)) 66 | 67 | parts = path.split("/") 68 | path = "" 69 | 70 | for part in parts: 71 | path += "/" + part 72 | if not self.exists(path): 73 | self.mkdir(path, mode) 74 | 75 | def rmdir(self, path): 76 | self.ftp.rmdir(path) 77 | 78 | def info(self, path): 79 | s = self.ftp.stat(path) 80 | if S_ISDIR(s.st_mode): 81 | t = "directory" 82 | elif S_ISLNK(s.st_mode): 83 | t = "link" 84 | else: 85 | t = "file" 86 | return { 87 | "name": path + "/" if t == "directory" else path, 88 | "size": s.st_size, 89 | "type": t, 90 | "uid": s.st_uid, 91 | "gid": s.st_gid, 92 | "time": s.st_atime, 93 | "mtime": s.st_mtime, 94 | } 95 | 96 | def ls(self, path, detail=False): 97 | out = ["/".join([path.rstrip("/"), p]) for p in self.ftp.listdir(path)] 98 | out = [self.info(o) for o in out] 99 | if detail: 100 | return out 101 | return sorted([p["name"] for p in out]) 102 | 103 | def put(self, lpath, rpath): 104 | self.ftp.put(lpath, rpath) 105 | 106 | def get(self, rpath, lpath): 107 | self.ftp.get(rpath, lpath) 108 | 109 | def _open(self, path, mode="rb", block_size=None, **kwargs): 110 | """ 111 | block_size: int or None 112 | If 0, no buffering, if 1, line buffering, if >1, buffer that many 113 | bytes, if None use default from paramiko. 
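        When ``autocommit=False`` (i.e., within a transaction), data is
        written to a temporary path and only moved into place on commit.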
114 |         """
115 |         if kwargs.get("autocommit", True) is False:
116 |             # writes to temporary file, move on commit
117 |             path2 = "{}/{}".format(self.temppath, uuid.uuid4())
118 |             f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
119 |             f.temppath = path2
120 |             f.targetpath = path
121 |             f.fs = self
122 |             f.commit = types.MethodType(commit_a_file, f)
123 |             f.discard = types.MethodType(discard_a_file, f)
124 |         else:
125 |             f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
126 |         return f
127 | 
128 |     def _rm(self, path):
129 |         if self.isdir(path):
130 |             self.ftp.rmdir(path)
131 |         else:
132 |             self.ftp.remove(path)
133 | 
134 |     def mv(self, old, new):
135 |         self.ftp.posix_rename(old, new)
136 | 
137 | 
138 | def commit_a_file(self):
139 |     self.fs.mv(self.temppath, self.targetpath)
140 | 
141 | 
142 | def discard_a_file(self):
143 |     self.fs._rm(self.temppath)
144 | 
--------------------------------------------------------------------------------
/docs/source/intro.rst:
--------------------------------------------------------------------------------
 1 | Introduction
 2 | ============
 3 | 
 4 | To get stuck into using the package, rather than reading about its philosophy and history, you can
 5 | skip to :doc:`usage`.
 6 | 
 7 | Background
 8 | ----------
 9 | 
10 | Python provides a standard interface for open files, so that alternate implementations of file-like objects can
11 | work seamlessly with many functions which rely only on the methods of that standard interface. A number of libraries
12 | have implemented a similar concept for file-systems, where file operations can be performed on a logical file-system
13 | which may be local, a structured data store, or some remote service.
14 | 
15 | This repository is intended to be a place to define a standard interface that such file-systems should adhere to,
16 | such that code using them should not have to know the details of the implementation in order to operate on any of
17 | a number of backends. The hope is that the community can come together to
18 | define an interface that is best for the largest number of users, and that having the specification will make
19 | developing other file-system implementations simpler.
20 | 
21 | History
22 | -------
23 | 
24 | I (Martin Durant) have been involved in building a number of remote-data file-system implementations, principally
25 | in the context of the `Dask`_ project. In particular, several are listed
26 | in `docs`_ with links to the specific repositories.
27 | With common authorship, there is much that is similar between the implementations, for example posix-like naming
28 | of the operations, and this has allowed Dask to be able to interact with the various backends and parse generic
29 | URLs in order to select amongst them. However, *some* extra code was required in each case to adapt the peculiarities
30 | of each implementation to the generic usage that Dask demanded. People may find the
31 | `code`_ which parses URLs and creates file-system
32 | instances interesting.
33 | 
34 | .. _Dask: http://dask.pydata.org/en/latest/
35 | .. _docs: http://dask.pydata.org/en/latest/remote-data-services.html
36 | .. _code: https://github.com/dask/dask/blob/master/dask/bytes/core.py#L266
37 | 
38 | At the same time, the Apache `Arrow`_ project was also concerned with a similar problem,
39 | particularly a common interface to local and HDFS files, for example the
40 | `hdfs`_ interface (which actually communicated with HDFS
41 | with a choice of driver).
These are mostly used internally within Arrow, but Dask was modified in order to be able 42 | to use the alternate HDFS interface (which solves some security issues with `hdfs3`). In the process, a 43 | `conversation`_ 44 | was started, and I invite all interested parties to continue the conversation in this location. 45 | 46 | .. _Arrow: https://arrow.apache.org/ 47 | .. _hdfs: https://arrow.apache.org/docs/python/filesystems.html 48 | .. _conversation: https://github.com/dask/dask/issues/2880 49 | 50 | There is a good argument that this type of code has no place in Dask, which is concerned with making graphs 51 | representing computations, and executing those graphs on a scheduler. Indeed, the file-systems are generally useful, 52 | and each has a user-base wider than just those that work via Dask. 53 | 54 | Influences 55 | ---------- 56 | 57 | The following places to consider, when choosing the definitions of how we would like the file-system specification 58 | to look: 59 | 60 | - python's `os`_ module and its `path` namespace; also other file-connected 61 | functionality in the standard library 62 | - posix/bash method naming conventions that linux/unix/osx users are familiar with; or perhaps their Windows variants 63 | - the existing implementations for the various backends (e.g., 64 | `gcsfs`_ or Arrow's 65 | `hdfs`_) 66 | - `pyfilesystems`_, an attempt to do something similar, with a 67 | plugin architecture. This conception has several types of local file-system, and a lot of well-thought-out 68 | validation code. 69 | 70 | .. _os: https://docs.python.org/3/library/os.html 71 | .. _gcsfs: http://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem 72 | .. _pyfilesystems: https://docs.pyfilesystem.org/en/latest/index.html 73 | 74 | Not pyfilesystems? 75 | ------------------ 76 | 77 | It might have been conceivable to reuse code in ``pyfilesystems``, which has an established interface and several 78 | implementations of its own. However, it supports none of the :ref:`highlight`, critical to 79 | cloud and parallel access, and would not be easy to 80 | coerce. Following on the success of ``s3fs`` and ``gcsfs``, and their use within Dask, it seemed best to 81 | have an interface as close to those as possible. See a 82 | `discussion`_ on the topic. 83 | 84 | .. _discussion: https://github.com/intake/filesystem_spec/issues/5 85 | 86 | Structure of the package 87 | ------------------------ 88 | 89 | The best place to get a feel for the contents of ``fsspec`` is by looking through the :doc:`usage` and 90 | :doc:`api` sections. In addition, the source code will be interesting for those who wish to subclass and 91 | develop new file-system implementations. ``fsspec/spec.py`` contains the main abstract file-system class 92 | to derive from, ``AbstractFileSystem``. 93 | 94 | .. 
_zarr: https://zarr.readthedocs.io 95 | -------------------------------------------------------------------------------- /fsspec/implementations/dask.py: -------------------------------------------------------------------------------- 1 | from distributed.worker import get_worker 2 | from distributed.client import _get_global_client, Client 3 | import dask 4 | from fsspec.spec import AbstractFileSystem, AbstractBufferedFile 5 | from fsspec import filesystem 6 | from fsspec.utils import infer_storage_options 7 | 8 | 9 | def _get_client(client): 10 | if client is None: 11 | return _get_global_client() 12 | elif isinstance(client, Client): 13 | return client 14 | else: 15 | # e.g., connection string 16 | return Client(client) 17 | 18 | 19 | class DaskWorkerFileSystem(AbstractFileSystem): 20 | """View files accessible to a worker as any other remote file-system 21 | 22 | When instances are run on the worker, uses the real filesystem. When 23 | run on the client, they call the worker to provide information or data. 24 | 25 | **Warning** this implementation is experimental, and read-only for now. 26 | """ 27 | 28 | def __init__( 29 | self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs 30 | ): 31 | super().__init__(**kwargs) 32 | if not (fs is None) ^ (target_protocol is None): 33 | raise ValueError( 34 | "Please provide one of filesystem instance (fs) or" 35 | " target_protocol, not both" 36 | ) 37 | self.target_protocol = target_protocol 38 | self.target_options = target_options 39 | self.worker = None 40 | self.client = client 41 | self.fs = fs 42 | self._determine_worker() 43 | 44 | @staticmethod 45 | def _get_kwargs_from_urls(path): 46 | so = infer_storage_options(path) 47 | if "host" in so and "port" in so: 48 | return {"client": f"{so['host']}:{so['port']}"} 49 | else: 50 | return {} 51 | 52 | def _determine_worker(self): 53 | try: 54 | get_worker() 55 | self.worker = True 56 | if self.fs is None: 57 | self.fs = filesystem( 58 | self.target_protocol, **(self.target_options or {}) 59 | ) 60 | except ValueError: 61 | self.worker = False 62 | self.client = _get_client(self.client) 63 | self.rfs = dask.delayed(self) 64 | 65 | def mkdir(self, *args, **kwargs): 66 | if self.worker: 67 | self.fs.mkdir(*args, **kwargs) 68 | else: 69 | self.rfs.mkdir(*args, **kwargs).compute() 70 | 71 | def rm(self, *args, **kwargs): 72 | if self.worker: 73 | self.fs.rm(*args, **kwargs) 74 | else: 75 | self.rfs.rm(*args, **kwargs).compute() 76 | 77 | def copy(self, *args, **kwargs): 78 | if self.worker: 79 | self.fs.copy(*args, **kwargs) 80 | else: 81 | self.rfs.copy(*args, **kwargs).compute() 82 | 83 | def mv(self, *args, **kwargs): 84 | if self.worker: 85 | self.fs.mv(*args, **kwargs) 86 | else: 87 | self.rfs.mv(*args, **kwargs).compute() 88 | 89 | def ls(self, *args, **kwargs): 90 | if self.worker: 91 | return self.fs.ls(*args, **kwargs) 92 | else: 93 | return self.rfs.ls(*args, **kwargs).compute() 94 | 95 | def _open( 96 | self, 97 | path, 98 | mode="rb", 99 | block_size=None, 100 | autocommit=True, 101 | cache_options=None, 102 | **kwargs 103 | ): 104 | if self.worker: 105 | return self.fs._open( 106 | path, 107 | mode=mode, 108 | block_size=block_size, 109 | autocommit=autocommit, 110 | cache_options=cache_options, 111 | **kwargs 112 | ) 113 | else: 114 | return DaskFile( 115 | fs=self, 116 | path=path, 117 | mode=mode, 118 | block_size=block_size, 119 | autocommit=autocommit, 120 | cache_options=cache_options, 121 | **kwargs 122 | ) 123 | 124 | def fetch_range(self, path, mode, 
start, end): 125 | if self.worker: 126 | with self._open(path, mode) as f: 127 | f.seek(start) 128 | return f.read(end - start) 129 | else: 130 | return self.rfs.fetch_range(path, mode, start, end).compute() 131 | 132 | 133 | class DaskFile(AbstractBufferedFile): 134 | def __init__(self, mode="rb", **kwargs): 135 | if mode != "rb": 136 | raise ValueError('Remote dask files can only be opened in "rb" mode') 137 | super().__init__(**kwargs) 138 | 139 | def _upload_chunk(self, final=False): 140 | pass 141 | 142 | def _initiate_upload(self): 143 | """ Create remote file/upload """ 144 | pass 145 | 146 | def _fetch_range(self, start, end): 147 | """Get the specified set of bytes from remote""" 148 | return self.fs.fetch_range(self.path, self.mode, start, end) 149 | -------------------------------------------------------------------------------- /fsspec/compression.py: -------------------------------------------------------------------------------- 1 | """Helper functions for a standard streaming compression API""" 2 | from bz2 import BZ2File 3 | from gzip import GzipFile 4 | from zipfile import ZipFile 5 | 6 | import fsspec.utils 7 | from fsspec.spec import AbstractBufferedFile 8 | 9 | 10 | def noop_file(file, mode, **kwargs): 11 | return file 12 | 13 | 14 | # TODO: files should also be available as contexts 15 | # should be functions of the form func(infile, mode=, **kwargs) -> file-like 16 | compr = {None: noop_file} 17 | 18 | 19 | def register_compression(name, callback, extensions, force=False): 20 | """Register an "inferable" file compression type. 21 | 22 | Registers transparent file compression type for use with fsspec.open. 23 | Compression can be specified by name in open, or "infer"-ed for any files 24 | ending with the given extensions. 25 | 26 | Args: 27 | name: (str) The compression type name. Eg. "gzip". 28 | callback: A callable of form (infile, mode, **kwargs) -> file-like. 29 | Accepts an input file-like object, the target mode and kwargs. 30 | Returns a wrapped file-like object. 31 | extensions: (str, Iterable[str]) A file extension, or list of file 32 | extensions for which to infer this compression scheme. Eg. "gz". 33 | force: (bool) Force re-registration of compression type or extensions. 34 | 35 | Raises: 36 | ValueError: If name or extensions already registered, and not force. 
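    Example:
        Registering a hypothetical no-op "compression" under a made-up
        ".raw" suffix (illustrative only)::

            register_compression("raw", lambda f, mode, **kw: f, "raw")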
37 | 38 | """ 39 | if isinstance(extensions, str): 40 | extensions = [extensions] 41 | 42 | # Validate registration 43 | if name in compr and not force: 44 | raise ValueError("Duplicate compression registration: %s" % name) 45 | 46 | for ext in extensions: 47 | if ext in fsspec.utils.compressions and not force: 48 | raise ValueError( 49 | "Duplicate compression file extension: %s (%s)" % (ext, name) 50 | ) 51 | 52 | compr[name] = callback 53 | 54 | for ext in extensions: 55 | fsspec.utils.compressions[ext] = name 56 | 57 | 58 | def unzip(infile, mode="rb", filename=None, **kwargs): 59 | if "r" not in mode: 60 | filename = filename or "file" 61 | z = ZipFile(infile, mode="w", **kwargs) 62 | fo = z.open(filename, mode="w") 63 | fo.close = lambda closer=fo.close: closer() or z.close() 64 | return fo 65 | z = ZipFile(infile) 66 | if filename is None: 67 | filename = z.namelist()[0] 68 | return z.open(filename, mode="r", **kwargs) 69 | 70 | 71 | register_compression("zip", unzip, "zip") 72 | register_compression("bz2", BZ2File, "bz2") 73 | register_compression("gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz") 74 | 75 | try: 76 | from lzma import LZMAFile 77 | 78 | register_compression("lzma", LZMAFile, "xz") 79 | register_compression("xz", LZMAFile, "xz", force=True) 80 | except ImportError: 81 | pass 82 | 83 | try: 84 | import lzmaffi 85 | 86 | register_compression("lzma", lzmaffi.LZMAFile, "xz", force=True) 87 | register_compression("xz", lzmaffi.LZMAFile, "xz", force=True) 88 | except ImportError: 89 | pass 90 | 91 | 92 | class SnappyFile(AbstractBufferedFile): 93 | def __init__(self, infile, mode, **kwargs): 94 | import snappy 95 | 96 | self.details = {"size": 999999999} # not true, but OK if we don't seek 97 | super().__init__(fs=None, path="snappy", mode=mode.strip("b") + "b", **kwargs) 98 | self.infile = infile 99 | if "r" in mode: 100 | self.codec = snappy.StreamDecompressor() 101 | else: 102 | self.codec = snappy.StreamCompressor() 103 | 104 | def _upload_chunk(self, final=False): 105 | self.buffer.seek(0) 106 | out = self.codec.add_chunk(self.buffer.read()) 107 | self.infile.write(out) 108 | return True 109 | 110 | def seek(self, loc, whence=0): 111 | raise NotImplementedError("SnappyFile is not seekable") 112 | 113 | def seekable(self): 114 | return False 115 | 116 | def _fetch_range(self, start, end): 117 | """Get the specified set of bytes from remote""" 118 | data = self.infile.read(end - start) 119 | return self.codec.decompress(data) 120 | 121 | 122 | try: 123 | import snappy 124 | 125 | snappy.compress 126 | # Snappy may use the .sz file extension, but this is not part of the 127 | # standard implementation. 
128 | register_compression("snappy", SnappyFile, []) 129 | 130 | except (ImportError, NameError): 131 | pass 132 | 133 | try: 134 | import lz4.frame 135 | 136 | register_compression("lz4", lz4.frame.open, "lz4") 137 | except ImportError: 138 | pass 139 | 140 | try: 141 | import zstandard as zstd 142 | 143 | def zstandard_file(infile, mode="rb"): 144 | if "r" in mode: 145 | cctx = zstd.ZstdDecompressor() 146 | return cctx.stream_reader(infile) 147 | else: 148 | cctx = zstd.ZstdCompressor(level=10) 149 | return cctx.stream_writer(infile) 150 | 151 | register_compression("zstd", zstandard_file, "zst") 152 | except ImportError: 153 | pass 154 | -------------------------------------------------------------------------------- /fsspec/tests/test_compression.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pytest 4 | 5 | import fsspec.core 6 | from fsspec.compression import compr, register_compression 7 | from fsspec.utils import compressions, infer_compression 8 | 9 | 10 | def test_infer_custom_compression(): 11 | """Inferred compression gets values from fsspec.compression.compr.""" 12 | assert infer_compression("fn.zip") == "zip" 13 | assert infer_compression("fn.gz") == "gzip" 14 | assert infer_compression("fn.unknown") is None 15 | assert infer_compression("fn.test_custom") is None 16 | assert infer_compression("fn.tst") is None 17 | 18 | register_compression("test_custom", lambda f, **kwargs: f, "tst") 19 | 20 | try: 21 | assert infer_compression("fn.zip") == "zip" 22 | assert infer_compression("fn.gz") == "gzip" 23 | assert infer_compression("fn.unknown") is None 24 | assert infer_compression("fn.test_custom") is None 25 | assert infer_compression("fn.tst") == "test_custom" 26 | 27 | # Duplicate registration in name or extension raises a value error. 28 | with pytest.raises(ValueError): 29 | register_compression("test_custom", lambda f, **kwargs: f, "tst") 30 | 31 | with pytest.raises(ValueError): 32 | register_compression("test_conflicting", lambda f, **kwargs: f, "tst") 33 | assert "test_conflicting" not in compr 34 | 35 | # ...but can be forced. 
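As an aside on the mechanics this test exercises: inference is essentially a lookup of the final filename suffix in ``fsspec.utils.compressions``, so compound extensions resolve by their last component:

    from fsspec.utils import infer_compression

    assert infer_compression("table.csv.gz") == "gzip"  # last suffix wins
    assert infer_compression("archive.zip") == "zip"
    assert infer_compression("plain.txt") is None       # unknown suffix -> None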
36 | register_compression( 37 | "test_conflicting", lambda f, **kwargs: f, "tst", force=True 38 | ) 39 | assert infer_compression("fn.zip") == "zip" 40 | assert infer_compression("fn.gz") == "gzip" 41 | assert infer_compression("fn.unknown") is None 42 | assert infer_compression("fn.test_custom") is None 43 | assert infer_compression("fn.tst") == "test_conflicting" 44 | 45 | finally: 46 | del compr["test_custom"] 47 | del compr["test_conflicting"] 48 | del compressions["tst"] 49 | 50 | 51 | def test_lzma_compression_name(): 52 | pytest.importorskip("lzma") 53 | assert infer_compression("fn.xz") == "xz" 54 | 55 | 56 | def test_lz4_compression(tmpdir): 57 | """Infer lz4 compression for .lz4 files if lz4 is available.""" 58 | tmp_path = pathlib.Path(str(tmpdir)) 59 | 60 | lz4 = pytest.importorskip("lz4") 61 | 62 | tmp_path.mkdir(exist_ok=True) 63 | 64 | tdat = "foobar" * 100 65 | 66 | with fsspec.core.open( 67 | str(tmp_path / "out.lz4"), mode="wt", compression="infer" 68 | ) as outfile: 69 | outfile.write(tdat) 70 | 71 | compressed = (tmp_path / "out.lz4").open("rb").read() 72 | assert lz4.frame.decompress(compressed).decode() == tdat 73 | 74 | with fsspec.core.open( 75 | str(tmp_path / "out.lz4"), mode="rt", compression="infer" 76 | ) as infile: 77 | assert infile.read() == tdat 78 | 79 | with fsspec.core.open( 80 | str(tmp_path / "out.lz4"), mode="rt", compression="lz4" 81 | ) as infile: 82 | assert infile.read() == tdat 83 | 84 | 85 | def test_zstd_compression(tmpdir): 86 | """Infer zstd compression for .zst files if zstandard is available.""" 87 | tmp_path = pathlib.Path(str(tmpdir)) 88 | 89 | zstd = pytest.importorskip("zstandard") 90 | 91 | tmp_path.mkdir(exist_ok=True) 92 | 93 | tdat = "foobar" * 100 94 | 95 | with fsspec.core.open( 96 | str(tmp_path / "out.zst"), mode="wt", compression="infer" 97 | ) as outfile: 98 | outfile.write(tdat) 99 | 100 | compressed = (tmp_path / "out.zst").open("rb").read() 101 | assert zstd.ZstdDecompressor().decompress(compressed, len(tdat)).decode() == tdat 102 | 103 | with fsspec.core.open( 104 | str(tmp_path / "out.zst"), mode="rt", compression="infer" 105 | ) as infile: 106 | assert infile.read() == tdat 107 | 108 | with fsspec.core.open( 109 | str(tmp_path / "out.zst"), mode="rt", compression="zstd" 110 | ) as infile: 111 | assert infile.read() == tdat 112 | 113 | 114 | def test_snappy_compression(tmpdir): 115 | """No registered compression for snappy, but can be specified.""" 116 | tmp_path = pathlib.Path(str(tmpdir)) 117 | 118 | snappy = pytest.importorskip("snappy") 119 | 120 | tmp_path.mkdir(exist_ok=True) 121 | 122 | tdat = "foobar" * 100 123 | 124 | # Snappy isn't inferred. 125 | with fsspec.core.open( 126 | str(tmp_path / "out.snappy"), mode="wt", compression="infer" 127 | ) as outfile: 128 | outfile.write(tdat) 129 | assert (tmp_path / "out.snappy").open("rb").read().decode() == tdat 130 | 131 | # but can be specified. 
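A general note before the explicit-compression case below: a codec always wraps the underlying binary stream, and ``fsspec.open`` layers the text encoding on top for "t" modes, which is why str data round-trips. A sketch with the built-in gzip codec (the path is illustrative):

    import fsspec

    with fsspec.open("memory://t.txt.gz", "wt", compression="infer") as f:
        f.write("text passes through gzip transparently")

    with fsspec.open("memory://t.txt.gz", "rt", compression="infer") as f:
        assert f.read() == "text passes through gzip transparently"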
132 | with fsspec.core.open( 133 | str(tmp_path / "out.snappy"), mode="wt", compression="snappy" 134 | ) as outfile: 135 | outfile.write(tdat) 136 | 137 | compressed = (tmp_path / "out.snappy").open("rb").read() 138 | assert snappy.StreamDecompressor().decompress(compressed).decode() == tdat 139 | 140 | with fsspec.core.open( 141 | str(tmp_path / "out.snappy"), mode="rb", compression="infer" 142 | ) as infile: 143 | assert infile.read() == compressed 144 | 145 | with fsspec.core.open( 146 | str(tmp_path / "out.snappy"), mode="rt", compression="snappy" 147 | ) as infile: 148 | assert infile.read() == tdat 149 | -------------------------------------------------------------------------------- /fsspec/fuse.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import stat 4 | from errno import ENOENT, EIO 5 | from fuse import Operations, FuseOSError 6 | import threading 7 | import time 8 | from fuse import FUSE 9 | 10 | 11 | class FUSEr(Operations): 12 | def __init__(self, fs, path): 13 | self.fs = fs 14 | self.cache = {} 15 | self.root = path.rstrip("/") + "/" 16 | self.counter = 0 17 | 18 | def getattr(self, path, fh=None): 19 | path = "".join([self.root, path.lstrip("/")]).rstrip("/") 20 | try: 21 | info = self.fs.info(path) 22 | except FileNotFoundError: 23 | raise FuseOSError(ENOENT) 24 | data = {"st_uid": 1000, "st_gid": 1000} 25 | perm = 0o777 26 | 27 | if info["type"] != "file": 28 | data["st_mode"] = stat.S_IFDIR | perm 29 | data["st_size"] = 0 30 | data["st_blksize"] = 0 31 | else: 32 | data["st_mode"] = stat.S_IFREG | perm 33 | data["st_size"] = info["size"] 34 | data["st_blksize"] = 5 * 2 ** 20 35 | data["st_nlink"] = 1 36 | data["st_atime"] = time.time() 37 | data["st_ctime"] = time.time() 38 | data["st_mtime"] = time.time() 39 | return data 40 | 41 | def readdir(self, path, fh): 42 | path = "".join([self.root, path.lstrip("/")]) 43 | files = self.fs.ls(path, False) 44 | files = [os.path.basename(f.rstrip("/")) for f in files] 45 | return [".", ".."] + files 46 | 47 | def mkdir(self, path, mode): 48 | path = "".join([self.root, path.lstrip("/")]) 49 | self.fs.mkdir(path) 50 | return 0 51 | 52 | def rmdir(self, path): 53 | path = "".join([self.root, path.lstrip("/")]) 54 | self.fs.rmdir(path) 55 | return 0 56 | 57 | def read(self, path, size, offset, fh): 58 | f = self.cache[fh] 59 | f.seek(offset) 60 | out = f.read(size) 61 | return out 62 | 63 | def write(self, path, data, offset, fh): 64 | f = self.cache[fh] 65 | f.write(data) 66 | return len(data) 67 | 68 | def create(self, path, flags, fi=None): 69 | fn = "".join([self.root, path.lstrip("/")]) 70 | f = self.fs.open(fn, "wb") 71 | self.cache[self.counter] = f 72 | self.counter += 1 73 | return self.counter - 1 74 | 75 | def open(self, path, flags): 76 | fn = "".join([self.root, path.lstrip("/")]) 77 | if flags % 2 == 0: 78 | # read 79 | mode = "rb" 80 | else: 81 | # write/create 82 | mode = "wb" 83 | self.cache[self.counter] = self.fs.open(fn, mode) 84 | self.counter += 1 85 | return self.counter - 1 86 | 87 | def truncate(self, path, length, fh=None): 88 | fn = "".join([self.root, path.lstrip("/")]) 89 | if length != 0: 90 | raise NotImplementedError 91 | # maybe should be no-op since open with write sets size to zero anyway 92 | self.fs.touch(fn) 93 | 94 | def unlink(self, path): 95 | fn = "".join([self.root, path.lstrip("/")]) 96 | try: 97 | self.fs.rm(fn, False) 98 | except (IOError, FileNotFoundError): 99 | raise FuseOSError(EIO) 100 
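To make the handle bookkeeping concrete: each ``open``/``create`` above hands out an integer ``fh`` indexing ``self.cache``, which ``release`` (next) closes and evicts. A sketch of driving the class directly, assuming fusepy is importable (paths illustrative):

    import fsspec
    from fsspec.fuse import FUSEr

    fuser = FUSEr(fsspec.filesystem("memory"), "/")
    fh = fuser.create("/demo", flags=0)   # flags are ignored by create; opens "wb"
    fuser.write("/demo", b"abc", 0, fh)
    fuser.release("/demo", fh)            # closes the file and drops the handle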
| 101 | def release(self, path, fh): 102 | try: 103 | if fh in self.cache: 104 | f = self.cache[fh] 105 | f.close() 106 | self.cache.pop(fh) 107 | except Exception as e: 108 | print(e) 109 | return 0 110 | 111 | def chmod(self, path, mode): 112 | raise NotImplementedError 113 | 114 | 115 | def run(fs, path, mount_point, foreground=True, threads=False): 116 | """Mount a file-system in a local directory 117 | 118 | This uses fusepy to make it appear as if a given path on an fsspec 119 | instance is in fact resident within the local file-system. 120 | 121 | This requires that fusepy be installed, and that FUSE be available on 122 | the system (typically requiring a package to be installed with 123 | apt, yum, brew, etc.). 124 | 125 | Parameters 126 | ---------- 127 | fs: file-system instance 128 | From one of the compatible implementations 129 | path: str 130 | Location on that file-system to regard as the root directory to 131 | mount. Note that you typically should include the terminating "/" 132 | character. 133 | mount_point: str 134 | An empty directory on the local file-system where the contents of 135 | the remote path will appear 136 | foreground: bool 137 | Whether or not calling this function will block. Operation will 138 | typically be more stable if True. 139 | threads: bool 140 | Whether or not to create threads when responding to file operations 141 | within the mounted directory. Operation will typically be more 142 | stable if False. 143 | 144 | """ 145 | func = lambda: FUSE( 146 | FUSEr(fs, path), mount_point, nothreads=not threads, foreground=foreground 147 | ) 148 | if foreground is False: 149 | th = threading.Thread(target=func) 150 | th.daemon = True 151 | th.start() 152 | return th 153 | else: # pragma: no cover 154 | try: 155 | func() 156 | except KeyboardInterrupt: 157 | pass 158 | -------------------------------------------------------------------------------- /fsspec/implementations/zip.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | 3 | import zipfile 4 | from fsspec import AbstractFileSystem, open_files 5 | from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE 6 | 7 | 8 | class ZipFileSystem(AbstractFileSystem): 9 | """Read contents of ZIP archive as a file-system 10 | 11 | Keeps file object open while instance lives. 12 | 13 | This class is pickleable, but not necessarily thread-safe. 14 | """ 15 | 16 | root_marker = "" 17 | 18 | def __init__( 19 | self, 20 | fo="", 21 | mode="r", 22 | target_protocol=None, 23 | target_options=None, 24 | block_size=DEFAULT_BLOCK_SIZE, 25 | **kwargs 26 | ): 27 | """ 28 | Parameters 29 | ---------- 30 | fo: str or file-like 31 | Contains ZIP, and must exist. If a str, will fetch file using 32 | `open_files()`, which must return one file exactly. 33 | mode: str 34 | Currently, only 'r' is accepted 35 | target_protocol: str (optional) 36 | If ``fo`` is a string, this value can be used to override the 37 | FS protocol inferred from a URL 38 | target_options: dict (optional) 39 | Kwargs passed when instantiating the target FS, if ``fo`` is 40 | a string.
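A usage sketch for ``fsspec.fuse.run`` above, assuming fusepy and a system FUSE package are installed; the mount point is illustrative and must be an existing empty directory:

    import fsspec
    from fsspec.fuse import run

    fs = fsspec.filesystem("memory")
    with fs.open("/data/hello", "wb") as f:  # seed a file to expose
        f.write(b"world")

    # Blocks while mounted; pass foreground=False to get a daemon thread back.
    run(fs, "/data/", "/mnt/fsspec-demo")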
41 | """ 42 | super().__init__(self, **kwargs) 43 | if mode != "r": 44 | raise ValueError("Only read from zip files accepted") 45 | if isinstance(fo, str): 46 | files = open_files(fo, protocol=target_protocol, **(target_options or {})) 47 | if len(files) != 1: 48 | raise ValueError( 49 | 'Path "{}" did not resolve to exactly' 50 | 'one file: "{}"'.format(fo, files) 51 | ) 52 | fo = files[0] 53 | self.fo = fo.__enter__() # the whole instance is a context 54 | self.zip = zipfile.ZipFile(self.fo) 55 | self.block_size = block_size 56 | self.dir_cache = None 57 | 58 | @classmethod 59 | def _strip_protocol(cls, path): 60 | # zip file paths are always relative to the archive root 61 | return super()._strip_protocol(path).lstrip("/") 62 | 63 | def _get_dirs(self): 64 | if self.dir_cache is None: 65 | files = self.zip.infolist() 66 | self.dir_cache = { 67 | dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"} 68 | for dirname in self._all_dirnames(self.zip.namelist()) 69 | } 70 | for z in files: 71 | f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__} 72 | f.update( 73 | { 74 | "name": z.filename, 75 | "size": z.file_size, 76 | "type": ("directory" if z.is_dir() else "file"), 77 | } 78 | ) 79 | self.dir_cache[f["name"]] = f 80 | 81 | def info(self, path, **kwargs): 82 | self._get_dirs() 83 | path = self._strip_protocol(path) 84 | if path in self.dir_cache: 85 | return self.dir_cache[path] 86 | elif path + "/" in self.dir_cache: 87 | return self.dir_cache[path + "/"] 88 | else: 89 | raise FileNotFoundError(path) 90 | 91 | def ls(self, path, detail=False, **kwargs): 92 | self._get_dirs() 93 | paths = {} 94 | for p, f in self.dir_cache.items(): 95 | p = p.rstrip("/") 96 | if "/" in p: 97 | root = p.rsplit("/", 1)[0] 98 | else: 99 | root = "" 100 | if root == path.rstrip("/"): 101 | paths[p] = f 102 | elif all( 103 | (a == b) 104 | for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) 105 | ): 106 | # root directory entry 107 | ppath = p.rstrip("/").split("/", 1)[0] 108 | if ppath not in paths: 109 | out = {"name": ppath + "/", "size": 0, "type": "directory"} 110 | paths[ppath] = out 111 | out = list(paths.values()) 112 | if detail: 113 | return out 114 | else: 115 | return list(sorted(f["name"] for f in out)) 116 | 117 | def cat(self, path): 118 | return self.zip.read(path) 119 | 120 | def _open( 121 | self, 122 | path, 123 | mode="rb", 124 | block_size=None, 125 | autocommit=True, 126 | cache_options=None, 127 | **kwargs 128 | ): 129 | path = self._strip_protocol(path) 130 | if mode != "rb": 131 | raise NotImplementedError 132 | info = self.info(path) 133 | out = self.zip.open(path, "r") 134 | out.size = info["size"] 135 | out.name = info["name"] 136 | return out 137 | 138 | def ukey(self, path): 139 | return tokenize(path, self.fo, self.protocol) 140 | 141 | def _all_dirnames(self, paths): 142 | """Returns *all* directory names for each path in paths, including intermediate ones. 
143 | 144 | Parameters 145 | ---------- 146 | paths: Iterable of path strings 147 | """ 148 | if len(paths) == 0: 149 | return set() 150 | 151 | dirnames = {self._parent(path) for path in paths} - {self.root_marker} 152 | return dirnames | self._all_dirnames(dirnames) 153 | -------------------------------------------------------------------------------- /fsspec/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import pytest 4 | import tempfile 5 | 6 | from fsspec.core import ( 7 | _expand_paths, 8 | OpenFile, 9 | open_local, 10 | get_compression, 11 | open_files, 12 | OpenFiles, 13 | ) 14 | import fsspec 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "path, name_function, num, out", 19 | [ 20 | [["apath"], None, 1, ["apath"]], 21 | ["apath.*.csv", None, 1, ["apath.0.csv"]], 22 | ["apath.*.csv", None, 2, ["apath.0.csv", "apath.1.csv"]], 23 | ["a*", lambda x: "abc"[x], 2, ["aa", "ab"]], 24 | ], 25 | ) 26 | def test_expand_paths(path, name_function, num, out): 27 | assert _expand_paths(path, name_function, num) == out 28 | 29 | 30 | def test_expand_error(): 31 | with pytest.raises(ValueError): 32 | _expand_paths("*.*", None, 1) 33 | 34 | 35 | def test_openfile_api(m): 36 | m.open("somepath", "wb").write(b"data") 37 | of = OpenFile(m, "somepath") 38 | assert str(of) == "" 39 | f = of.open() 40 | assert f.read() == b"data" 41 | f.close() 42 | with OpenFile(m, "somepath", mode="rt") as f: 43 | f.read() == "data" 44 | 45 | 46 | def test_openfile_open(m): 47 | of = OpenFile(m, "somepath", mode="wt") 48 | f = of.open() 49 | f.write("hello") 50 | assert m.size("somepath") == 0 # no flush yet 51 | del of 52 | assert m.size("somepath") == 0 # still no flush 53 | f.close() 54 | assert m.size("somepath") == 5 55 | 56 | 57 | def test_open_local(): 58 | d1 = str(tempfile.mkdtemp()) 59 | f1 = os.path.join(d1, "f1") 60 | open(f1, "w").write("test1") 61 | d2 = str(tempfile.mkdtemp()) 62 | fn = open_local("simplecache://" + f1, cache_storage=d2, target_protocol="file") 63 | assert isinstance(fn, str) 64 | assert open(fn).read() == "test1" 65 | assert d2 in fn 66 | 67 | 68 | def test_xz_lzma_compressions(): 69 | pytest.importorskip("lzma") 70 | # Ensure that both 'xz' and 'lzma' compression names can be parsed 71 | assert get_compression("some_file.xz", "infer") == "xz" 72 | assert get_compression("some_file.xz", "xz") == "xz" 73 | assert get_compression("some_file.xz", "lzma") == "lzma" 74 | 75 | 76 | def test_list(): 77 | here = os.path.abspath(os.path.dirname(__file__)) 78 | flist = os.listdir(here) 79 | plist = [os.path.join(here, p).replace("\\", "/") for p in flist] 80 | of = open_files(plist) 81 | assert len(of) == len(flist) 82 | assert [f.path for f in of] == plist 83 | 84 | 85 | def test_pathobject(tmpdir): 86 | import pathlib 87 | 88 | tmpdir = str(tmpdir) 89 | plist_str = [os.path.join(str(tmpdir), f).replace("\\", "/") for f in ["a", "b"]] 90 | open(plist_str[0], "w").write("first file") 91 | open(plist_str[1], "w").write("second file") 92 | plist = [pathlib.Path(p) for p in plist_str] 93 | of = open_files(plist) 94 | assert len(of) == 2 95 | assert [f.path for f in of] == plist_str 96 | 97 | of = open_files(plist[0]) 98 | assert len(of) == 1 99 | assert of[0].path == plist_str[0] 100 | with of[0] as f: 101 | assert f.read() == open(plist_str[0], "rb").read() 102 | 103 | 104 | def test_automkdir(tmpdir): 105 | dir = os.path.join(str(tmpdir), "a") 106 | of = fsspec.open(os.path.join(dir, "afile"), "w") 107 | with 
of: 108 | pass 109 | assert "afile" in os.listdir(dir) 110 | 111 | dir = os.path.join(str(tmpdir), "b") 112 | of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=True) 113 | with of: 114 | pass 115 | 116 | assert "bfile" in os.listdir(dir) 117 | 118 | dir = os.path.join(str(tmpdir), "c") 119 | with pytest.raises(FileNotFoundError): 120 | of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=False) 121 | with of: 122 | pass 123 | 124 | 125 | def test_automkdir_readonly(tmpdir): 126 | dir = os.path.join(str(tmpdir), "d") 127 | with pytest.raises(FileNotFoundError): 128 | of = fsspec.open(os.path.join(dir, "dfile"), "r") 129 | with of: 130 | pass 131 | 132 | 133 | def test_openfile_pickle_newline(): 134 | # GH#318 135 | test = fsspec.open(__file__, newline=b"") 136 | 137 | pickled = pickle.dumps(test) 138 | restored = pickle.loads(pickled) 139 | 140 | assert test.newline == restored.newline 141 | 142 | 143 | def test_mismatch(): 144 | with pytest.raises(ValueError, match="protocol"): 145 | open_files(["s3://test/path.csv", "/other/path.csv"]) 146 | 147 | 148 | def test_url_kwargs_chain(ftp_writable): 149 | host, port, username, password = "localhost", 2121, "user", "pass" 150 | data = b"hello" 151 | with fsspec.open( 152 | "ftp:///afile", "wb", host=host, port=port, username=username, password=password 153 | ) as f: 154 | f.write(data) 155 | 156 | with fsspec.open( 157 | "simplecache::ftp://{}:{}@{}:{}/afile".format(username, password, host, port), 158 | "rb", 159 | ) as f: 160 | assert f.read() == data 161 | 162 | 163 | def test_multi_context(tmpdir): 164 | fns = [os.path.join(tmpdir, fn) for fn in ["a", "b"]] 165 | files = open_files(fns, "wb") 166 | assert isinstance(files, OpenFiles) 167 | assert isinstance(files[0], OpenFile) 168 | assert len(files) == 2 169 | with files as of: 170 | assert len(of) == 2 171 | assert not of[0].closed 172 | assert of[0].name.endswith("a") 173 | assert of[0].closed 174 | assert repr(files) == "" 175 | 176 | 177 | def test_not_local(): 178 | with pytest.raises(ValueError, match="attribute local_file=True"): 179 | open_local("memory://afile") 180 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_zip.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | from contextlib import contextmanager 3 | 4 | import os 5 | import pickle 6 | import pytest 7 | import sys 8 | import tempfile 9 | import fsspec 10 | 11 | 12 | @contextmanager 13 | def tempzip(data={}): 14 | f = tempfile.mkstemp(suffix="zip")[1] 15 | with zipfile.ZipFile(f, mode="w") as z: 16 | for k, v in data.items(): 17 | z.writestr(k, v) 18 | try: 19 | yield f 20 | finally: 21 | try: 22 | os.remove(f) 23 | except (IOError, OSError): 24 | pass 25 | 26 | 27 | data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"} 28 | 29 | 30 | def test_empty(): 31 | with tempzip() as z: 32 | fs = fsspec.filesystem("zip", fo=z) 33 | assert fs.find("") == [] 34 | assert fs.find("", withdirs=True) == [] 35 | with pytest.raises(FileNotFoundError): 36 | fs.info("") 37 | assert fs.ls("") == [] 38 | 39 | 40 | def test_glob(): 41 | with tempzip(data) as z: 42 | fs = fsspec.filesystem("zip", fo=z) 43 | assert fs.glob("*/*/*th") == ["deeply/nested/path"] 44 | 45 | 46 | @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip-info odd on py35") 47 | def test_mapping(): 48 | with tempzip(data) as z: 49 | fs = fsspec.filesystem("zip", fo=z) 50 | m = fs.get_mapper("") 51 | assert list(m) == 
["a", "b", "deeply/nested/path"] 52 | assert m["b"] == data["b"] 53 | 54 | 55 | @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip not supported on py35") 56 | def test_pickle(): 57 | with tempzip(data) as z: 58 | fs = fsspec.filesystem("zip", fo=z) 59 | fs2 = pickle.loads(pickle.dumps(fs)) 60 | assert fs2.cat("b") == b"hello" 61 | 62 | 63 | def test_all_dirnames(): 64 | with tempzip() as z: 65 | fs = fsspec.filesystem("zip", fo=z) 66 | 67 | # fx are files, dx are a directories 68 | assert fs._all_dirnames([]) == set() 69 | assert fs._all_dirnames(["f1"]) == set() 70 | assert fs._all_dirnames(["f1", "f2"]) == set() 71 | assert fs._all_dirnames(["f1", "f2", "d1/f1"]) == {"d1"} 72 | assert fs._all_dirnames(["f1", "d1/f1", "d1/f2"]) == {"d1"} 73 | assert fs._all_dirnames(["f1", "d1/f1", "d2/f1"]) == {"d1", "d2"} 74 | assert fs._all_dirnames(["d1/d1/d1/f1"]) == {"d1", "d1/d1", "d1/d1/d1"} 75 | 76 | 77 | def test_ls(): 78 | with tempzip(data) as z: 79 | lhs = fsspec.filesystem("zip", fo=z) 80 | 81 | assert lhs.ls("") == ["a", "b", "deeply/"] 82 | assert lhs.ls("/") == lhs.ls("") 83 | 84 | assert lhs.ls("deeply") == ["deeply/nested/"] 85 | assert lhs.ls("deeply/") == lhs.ls("deeply") 86 | 87 | assert lhs.ls("deeply/nested") == ["deeply/nested/path"] 88 | assert lhs.ls("deeply/nested/") == lhs.ls("deeply/nested") 89 | 90 | 91 | def test_find(): 92 | with tempzip(data) as z: 93 | lhs = fsspec.filesystem("zip", fo=z) 94 | 95 | assert lhs.find("") == ["a", "b", "deeply/nested/path"] 96 | assert lhs.find("", withdirs=True) == [ 97 | "a", 98 | "b", 99 | "deeply/", 100 | "deeply/nested/", 101 | "deeply/nested/path", 102 | ] 103 | 104 | assert lhs.find("deeply") == ["deeply/nested/path"] 105 | assert lhs.find("deeply/") == lhs.find("deeply") 106 | 107 | 108 | def test_walk(): 109 | with tempzip(data) as z: 110 | fs = fsspec.filesystem("zip", fo=z) 111 | expected = [ 112 | # (dirname, list of subdirs, list of files) 113 | ("", ["deeply"], ["a", "b"]), 114 | ("deeply", ["nested"], []), 115 | ("deeply/nested", [], ["path"]), 116 | ] 117 | assert list(fs.walk("")) == expected 118 | 119 | 120 | def test_info(): 121 | with tempzip(data) as z: 122 | fs_cache = fsspec.filesystem("zip", fo=z) 123 | 124 | with pytest.raises(FileNotFoundError): 125 | fs_cache.info("i-do-not-exist") 126 | 127 | # Iterate over all directories 128 | # The ZipFile does not include additional information about the directories, 129 | for d in fs_cache._all_dirnames(data.keys()): 130 | lhs = fs_cache.info(d) 131 | expected = {"name": f"{d}/", "size": 0, "type": "directory"} 132 | assert lhs == expected 133 | 134 | # Iterate over all files 135 | for f, v in data.items(): 136 | lhs = fs_cache.info(f) 137 | assert lhs["name"] == f 138 | assert lhs["size"] == len(v) 139 | assert lhs["type"] == "file" 140 | 141 | # There are many flags specific to Zip Files. 
142 | # These are two we can use to check we are getting some of them 143 | assert "CRC" in lhs 144 | assert "compress_size" in lhs 145 | 146 | 147 | @pytest.mark.parametrize("scale", [128, 512, 4096]) 148 | def test_isdir_isfile(scale): 149 | def make_nested_dir(i): 150 | x = f"{i}" 151 | table = x.maketrans("0123456789", "ABCDEFGHIJ") 152 | return "/".join(x.translate(table)) 153 | 154 | scaled_data = {f"{make_nested_dir(i)}/{i}": b"" for i in range(1, scale + 1)} 155 | with tempzip(scaled_data) as z: 156 | fs = fsspec.filesystem("zip", fo=z) 157 | 158 | lhs_dirs, lhs_files = fs._all_dirnames(scaled_data.keys()), scaled_data.keys() 159 | 160 | # Warm-up the Cache, this is done in both cases anyways... 161 | fs._get_dirs() 162 | 163 | entries = lhs_files | lhs_dirs 164 | 165 | assert lhs_dirs == {e for e in entries if fs.isdir(e)} 166 | assert lhs_files == {e for e in entries if fs.isfile(e)} 167 | -------------------------------------------------------------------------------- /fsspec/implementations/reference.py: -------------------------------------------------------------------------------- 1 | import json 2 | from ..asyn import AsyncFileSystem 3 | from ..core import open, filesystem 4 | 5 | 6 | class ReferenceFileSystem(AsyncFileSystem): 7 | """View byte ranges of some other file as a file system 8 | 9 | Initial version: single file system target, which must support 10 | async, and must allow start and end args in _cat_file. Later versions 11 | may allow multiple arbitrary URLs for the targets. 12 | 13 | This FileSystem is read-only. It is designed to be used with async 14 | targets (for now). This FileSystem only allows whole-file access, no 15 | ``open``. We do not get original file details from the target FS. 16 | 17 | Configuration is by passing a dict of references at init, or a URL to 18 | a JSON file containing the same; this dict 19 | can also contain concrete data for some set of paths. 20 | 21 | Reference dict format: 22 | {path0: bytes_data, path1: (target_url, offset, size)} 23 | 24 | https://github.com/intake/fsspec-reference-maker/blob/main/README.md 25 | """ 26 | 27 | protocol = "reference" 28 | 29 | def __init__( 30 | self, 31 | references, 32 | target=None, 33 | ref_storage_args=None, 34 | target_protocol=None, 35 | target_options=None, 36 | fs=None, 37 | **kwargs 38 | ): 39 | """ 40 | 41 | Parameters 42 | ---------- 43 | references : dict or str 44 | The set of references to use for this instance, with a structure as above. 45 | If str, will use fsspec.open, in conjunction with ref_storage_args to 46 | open and parse JSON at this location. 47 | target : str 48 | For any references having target_url as None, this is the default file 49 | target to use 50 | ref_storage_args : dict 51 | If references is a str, use these kwargs for loading the JSON file 52 | target_protocol : str 53 | If fs is None, instantiate a file system using this protocol 54 | target_options : dict 55 | If fs is None, instantiate a filesystem using these kwargs 56 | fs : file system instance 57 | Directly provide a file system, if you want to configure it beforehand. 
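To illustrate the constructor being documented here, a sketch with an invented target URL; inline bytes are returned directly, while tuple entries are fetched as byte ranges from the (async) target:

    from fsspec.implementations.reference import ReferenceFileSystem

    refs = {
        "inline.bin": b"stored directly in the references",
        "chunk.bin": ("https://example.com/data.bin", 0, 1024),  # (url, offset, size)
    }
    fs = ReferenceFileSystem(refs, target_protocol="http")
    assert fs.cat("inline.bin") == b"stored directly in the references"  # no network access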
This 58 | takes precedence over target_protocol/target_options 59 | kwargs : passed to parent class 60 | """ 61 | if fs is not None: 62 | if not fs.async_impl: 63 | raise NotImplementedError("Only works with async targets") 64 | kwargs["loop"] = fs.loop 65 | super().__init__(**kwargs) 66 | if fs is None: 67 | fs = filesystem(target_protocol, loop=self.loop, **(target_options or {})) 68 | if not fs.async_impl: 69 | raise NotImplementedError("Only works with async targets") 70 | if isinstance(references, str): 71 | with open(references, "rb", **(ref_storage_args or {})) as f: 72 | references = json.load(f) 73 | self.references = references 74 | self.target = target 75 | self._process_references() 76 | self.fs = fs 77 | 78 | async def _cat_file(self, path): 79 | path = self._strip_protocol(path) 80 | part = self.references[path] 81 | if isinstance(part, bytes): 82 | return part 83 | elif isinstance(part, str): 84 | return part.encode() 85 | url, start, size = part 86 | end = start + size 87 | if url is None: 88 | url = self.target 89 | return await self.fs._cat_file(url, start=start, end=end) 90 | 91 | def _process_references(self): 92 | if "zarr_consolidated_format" in self.references: 93 | self.references = _unmodel_hdf5(self.references) 94 | self.dircache = {"": []} 95 | for path, part in self.references.items(): 96 | if isinstance(part, (bytes, str)): 97 | size = len(part) 98 | else: 99 | _, start, size = part 100 | # tuples follow the documented (url, offset, size) format 101 | par = self._parent(path) 102 | par0 = par 103 | while par0: 104 | # build parent directories 105 | if par0 not in self.dircache: 106 | self.dircache[par0] = [] 107 | self.dircache.setdefault(self._parent(par0), []).append( 108 | {"name": par0, "type": "directory", "size": 0} 109 | ) 110 | par0 = self._parent(par0) 111 | 112 | self.dircache[par].append({"name": path, "type": "file", "size": size}) 113 | 114 | def ls(self, path, detail=True, **kwargs): 115 | path = self._strip_protocol(path) 116 | out = self._ls_from_cache(path) 117 | if detail: 118 | return out 119 | return [o["name"] for o in out] 120 | 121 | 122 | def _unmodel_hdf5(references): 123 | """Special JSON format from HDF5""" 124 | # see https://gist.github.com/ajelenak/80354a95b449cedea5cca508004f97a9 125 | import re 126 | 127 | ref = {} 128 | for key, value in references["metadata"].items(): 129 | if key.endswith(".zchunkstore"): 130 | source = value.pop("source")["uri"] 131 | match = re.findall(r"https://([^.]+)\.s3\.amazonaws\.com", source) 132 | if match: 133 | source = source.replace( 134 | f"https://{match[0]}.s3.amazonaws.com", match[0] 135 | ) 136 | for k, v in value.items(): 137 | ref[k] = (source, v["offset"], v["size"]) 138 | else: 139 | ref[key] = json.dumps(value).encode() 140 | return ref 141 | -------------------------------------------------------------------------------- /docs/source/changelog.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Dev 5 | ------------- 6 | 7 | Features: 8 | 9 | - Add dbfs:// support 10 | 11 | Fixes: 12 | 13 | - random appending of a directory within the filesystem's ``find()`` method 14 | 15 | Version 0.8.5 16 | ------------- 17 | 18 | Features: 19 | 20 | - config system 21 | - libarchive implementation 22 | - add reference file system implementation 23 | 24 | Version 0.8.4 25 | ------------- 26 | 27 | Features: 28 | 29 | - function ``can_be_local`` to see whether URL is compatible with ``open_local`` 30 | - concurrent cat with filecaches, if backend supports it
31 | - jupyter FS 32 | 33 | Fixes: 34 | 35 | - dircache expiry after transaction 36 | - blockcache garbage collection 37 | - close for HDFS 38 | - windows tests 39 | - glob depth with "**" 40 | 41 | Version 0.8.3 42 | ------------- 43 | 44 | Features: 45 | 46 | - error options for cat 47 | - memory fs created time in detailed `ls` 48 | 49 | 50 | Fixes: 51 | 52 | - duplicate directories could appear in MemoryFileSystem 53 | - Added support for hat dollar lbrace rbrace regex character escapes in glob 54 | - Fix blockcache (was doing unnecessary work) 55 | - handle multibyte dtypes in readinto 56 | - Fix missing kwargs in call to _copy in asyn 57 | 58 | Other: 59 | 60 | - Stop inheriting from pyarrow.filesystem for pyarrow>=2.0 61 | - Raise low-level program friendly OSError. 62 | - Guard against instance reuse in new processes 63 | - Make hash_name a method on CachingFileSystem to make it easier to change. 64 | - Use get_event_loop for py3.6 compatibility 65 | 66 | Version 0.8.2 67 | ------------- 68 | 69 | Fixes: 70 | 71 | - More careful strip for caching 72 | 73 | Version 0.8.1 74 | ------------- 75 | 76 | Features: 77 | 78 | - add sign to base class 79 | - Allow calling of coroutines from normal code when running async 80 | - Implement writing for cached many files 81 | - Allow concurrent caching of remote files 82 | - Add gdrive:// protocol 83 | 84 | Fixes: 85 | 86 | - Fix memfs with exact ls 87 | - HTTPFileSystem requires requests and aiohttp in registry 88 | 89 | Other: 90 | 91 | - Allow http kwargs to clientSession 92 | - Use extras_require in setup.py for optional dependencies 93 | - Replacing md5 with sha256 for hash (CVE req) 94 | - Test against Python 3.8, drop 3.5 testing 95 | - add az alias for abfs 96 | 97 | Version 0.8.0 98 | ------------- 99 | 100 | Major release allowing async implementations with concurrent batch 101 | operations. 
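For example, with the async HTTP implementation a single ``cat`` call over a list of URLs gathers the requests concurrently rather than serially (URLs illustrative):

    import fsspec

    fs = fsspec.filesystem("http")
    pages = fs.cat(["https://example.com/a", "https://example.com/b"])
    # -> dict mapping each URL to its bytes, fetched in one event-loop pass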
102 | 103 | Features: 104 | 105 | - async filesystem spec, first applied to HTTP 106 | - OpenFiles context for multiple files 107 | - Document async, and ensure docstrings 108 | - Make LocalFileOpener iterable 109 | - handle smb:// protocol using smbprotocol package 110 | - allow Path object in open 111 | - simplecache write mode 112 | 113 | Fixes: 114 | 115 | - test_local: fix username not in home path 116 | - Tighten cacheFS if dir deleted 117 | - Fix race condition of lzma import when using threads 118 | - properly rewind MemoryFile 119 | - OpenFile newline in reduce 120 | 121 | Other: 122 | 123 | - Add aiobotocore to deps for s3fs check 124 | - Set default clobber=True on impl register 125 | - Use _get_kwargs_from_url when unchaining 126 | - Add cache_type and cache_options to HTTPFileSystem constructor 127 | 128 | Version 0.7.5 129 | ------------- 130 | 131 | * async implemented for HTTP as prototype (read-only) 132 | * write for simplecache 133 | * added SMB (Samba, protocol >=2) implementation 134 | 135 | Version 0.7.4 136 | ------------- 137 | 138 | * panel-based GUI 139 | 140 | 0.7.3 series 141 | ------------ 142 | 143 | * added ``git`` and ``github`` interfaces 144 | * added chained syntax for open, open_files and get_mapper 145 | * adapt webHDFS for HttpFS 146 | * added open_local 147 | * added ``simplecache``, and compression to both file caches 148 | 149 | 150 | Version 0.6.2 151 | ------------- 152 | 153 | * Added ``adl`` and ``abfs`` protocols to the known implementations registry (:pr:`209`) 154 | * Fixed issue with whole-file caching and implementations providing multiple protocols (:pr:`219`) 155 | 156 | Version 0.6.1 157 | ------------- 158 | 159 | * ``LocalFileSystem`` is now considered a filestore by pyarrow (:pr:`211`) 160 | * Fixed bug in HDFS filesystem with ``cache_options`` (:pr:`202`) 161 | * Fixed instance caching bug with multiple instances (:pr:`203`) 162 | 163 | 164 | Version 0.6.0 165 | ------------- 166 | 167 | * Fixed issues with filesystem instance caching. This was causing authorization errors 168 | in downstream libraries like ``gcsfs`` and ``s3fs`` in multi-threaded code (:pr:`155`, :pr:`181`) 169 | * Changed the default file caching strategy to :class:`fsspec.caching.ReadAheadCache` (:pr:`193`) 170 | * Moved file caches to the new ``fsspec.caching`` module. They're still available from 171 | their old location in ``fsspec.core``, but we recommend using the new location for new code (:pr:`195`) 172 | * Added a new file caching strategy, :class:`fsspec.caching.BlockCache` for fetching and caching 173 | file reads in blocks (:pr:`191`). 174 | * Fixed equality checks for file system instances to return ``False`` when compared to objects 175 | other than file systems (:pr:`192`) 176 | * Fixed a bug in :meth:`fsspec.FSMap.keys` returning a generator, which was consumed upon iteration (:pr:`189`). 177 | * Removed the magic addition of aliases in ``AbstractFileSystem.__init__``. Now alias methods are always 178 | present (:pr:`177`) 179 | * Deprecated passing ``trim`` to :class:`fsspec.spec.AbstractBufferedFile`.
Pass it in ``storage_options`` instead (:pr:`188`) 180 | * Improved handling of requests for :class:`fsspec.implementations.http.HTTPFileSystem` when the 181 | HTTP server responds with an (incorrect) content-length of 0 (:pr:`163`) 182 | * Added a ``detail=True`` parameter to :meth:`fsspec.spec.AbstractFileSystem.ls` (:pr:`168`) 183 | * Fixed handling of UNC/DFS paths (:issue:`154`) 184 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_libarchive.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | import os 4 | import pickle 5 | import pytest 6 | import tempfile 7 | import fsspec 8 | 9 | libarchive = pytest.importorskip("libarchive") 10 | 11 | 12 | @contextmanager 13 | def temparchive(data={}): 14 | f = tempfile.mkstemp(suffix="7z")[1] 15 | with libarchive.file_writer(f, "7zip") as archive: 16 | for k, v in data.items(): 17 | archive.add_file_from_memory(entry_path=k, entry_size=len(v), entry_data=v) 18 | try: 19 | yield f 20 | finally: 21 | try: 22 | os.remove(f) 23 | except (IOError, OSError): 24 | pass 25 | 26 | 27 | data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"} 28 | 29 | 30 | def test_empty(): 31 | with temparchive() as archive_file: 32 | fs = fsspec.filesystem("libarchive", fo=archive_file) 33 | assert fs.find("") == [] 34 | assert fs.find("", withdirs=True) == [] 35 | with pytest.raises(FileNotFoundError): 36 | fs.info("") 37 | assert fs.ls("") == [] 38 | 39 | 40 | def test_mapping(): 41 | with temparchive(data) as archive_file: 42 | fs = fsspec.filesystem("libarchive", fo=archive_file) 43 | m = fs.get_mapper("") 44 | 45 | fs._get_dirs() 46 | print(fs.dir_cache) 47 | 48 | assert list(m) == ["a", "b", "deeply/nested/path"] 49 | assert m["b"] == data["b"] 50 | 51 | 52 | def test_pickle(): 53 | with temparchive(data) as archive_file: 54 | fs = fsspec.filesystem("libarchive", fo=archive_file) 55 | fs2 = pickle.loads(pickle.dumps(fs)) 56 | assert fs2 is fs 57 | assert fs2.cat("b") == b"hello" 58 | 59 | 60 | def test_all_dirnames(): 61 | with temparchive() as archive_file: 62 | fs = fsspec.filesystem("libarchive", fo=archive_file) 63 | 64 | # fx are files, dx are a directories 65 | assert fs._all_dirnames([]) == set() 66 | assert fs._all_dirnames(["f1"]) == set() 67 | assert fs._all_dirnames(["f1", "f2"]) == set() 68 | assert fs._all_dirnames(["f1", "f2", "d1/f1"]) == {"d1"} 69 | assert fs._all_dirnames(["f1", "d1/f1", "d1/f2"]) == {"d1"} 70 | assert fs._all_dirnames(["f1", "d1/f1", "d2/f1"]) == {"d1", "d2"} 71 | assert fs._all_dirnames(["d1/d1/d1/f1"]) == {"d1", "d1/d1", "d1/d1/d1"} 72 | 73 | 74 | def test_ls(): 75 | with temparchive(data) as archive_file: 76 | lhs = fsspec.filesystem("libarchive", fo=archive_file) 77 | 78 | assert lhs.ls("") == ["a", "b", "deeply/"] 79 | assert lhs.ls("/") == lhs.ls("") 80 | 81 | assert lhs.ls("deeply") == ["deeply/nested/"] 82 | assert lhs.ls("deeply/") == lhs.ls("deeply") 83 | 84 | assert lhs.ls("deeply/nested") == ["deeply/nested/path"] 85 | assert lhs.ls("deeply/nested/") == lhs.ls("deeply/nested") 86 | 87 | 88 | def test_find(): 89 | with temparchive(data) as archive_file: 90 | lhs = fsspec.filesystem("libarchive", fo=archive_file) 91 | 92 | assert lhs.find("") == ["a", "b", "deeply/nested/path"] 93 | assert lhs.find("", withdirs=True) == [ 94 | "a", 95 | "b", 96 | "deeply/", 97 | "deeply/nested/", 98 | "deeply/nested/path", 99 | ] 100 | 101 | assert lhs.find("deeply") == 
["deeply/nested/path"] 102 | assert lhs.find("deeply/") == lhs.find("deeply") 103 | 104 | 105 | def test_walk(): 106 | with temparchive(data) as archive_file: 107 | fs = fsspec.filesystem("libarchive", fo=archive_file) 108 | expected = [ 109 | # (dirname, list of subdirs, list of files) 110 | ("", ["deeply"], ["a", "b"]), 111 | ("deeply", ["nested"], []), 112 | ("deeply/nested", [], ["path"]), 113 | ] 114 | for lhs, rhs in zip(fs.walk(""), expected): 115 | assert lhs[0] == rhs[0] 116 | assert sorted(lhs[1]) == sorted(rhs[1]) 117 | assert sorted(lhs[2]) == sorted(rhs[2]) 118 | 119 | 120 | def test_info(): 121 | with temparchive(data) as archive_file: 122 | fs_cache = fsspec.filesystem("libarchive", fo=archive_file) 123 | 124 | with pytest.raises(FileNotFoundError): 125 | fs_cache.info("i-do-not-exist") 126 | 127 | # Iterate over all directories 128 | # The 7zip archive does not include additional information about the 129 | # directories 130 | for d in fs_cache._all_dirnames(data.keys()): 131 | lhs = fs_cache.info(d) 132 | expected = {"name": f"{d}/", "size": 0, "type": "directory"} 133 | assert lhs == expected 134 | 135 | # Iterate over all files 136 | for f, v in data.items(): 137 | lhs = fs_cache.info(f) 138 | assert lhs["name"] == f 139 | assert lhs["size"] == len(v) 140 | assert lhs["type"] == "file" 141 | 142 | # These are the specific flags retrieved from the archived files 143 | assert "created" in lhs 144 | assert "mode" in lhs 145 | assert "uid" in lhs 146 | assert "gid" in lhs 147 | assert "mtime" in lhs 148 | 149 | 150 | @pytest.mark.parametrize("scale", [128, 512, 4096]) 151 | def test_isdir_isfile(scale): 152 | def make_nested_dir(i): 153 | x = f"{i}" 154 | table = x.maketrans("0123456789", "ABCDEFGHIJ") 155 | return "/".join(x.translate(table)) 156 | 157 | scaled_data = {f"{make_nested_dir(i)}/{i}": b"" for i in range(1, scale + 1)} 158 | with temparchive(scaled_data) as archive_file: 159 | fs = fsspec.filesystem("libarchive", fo=archive_file) 160 | 161 | lhs_dirs, lhs_files = fs._all_dirnames(scaled_data.keys()), scaled_data.keys() 162 | 163 | # Warm-up the Cache, this is done in both cases anyways... 164 | fs._get_dirs() 165 | 166 | entries = lhs_files | lhs_dirs 167 | 168 | assert lhs_dirs == {e for e in entries if fs.isdir(e)} 169 | assert lhs_files == {e for e in entries if fs.isfile(e)} 170 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # fsspec documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jan 15 18:11:02 2018. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | 22 | sys.path.insert(0, os.path.abspath("../..")) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 
28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.viewcode", 37 | "sphinx.ext.autosummary", 38 | "sphinx.ext.extlinks", 39 | "numpydoc", 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ["_templates"] 44 | 45 | # The suffix(es) of source filenames. 46 | # You can specify multiple suffix as a list of string: 47 | # 48 | # source_suffix = ['.rst', '.md'] 49 | source_suffix = ".rst" 50 | 51 | # The master toctree document. 52 | master_doc = "index" 53 | 54 | # General information about the project. 55 | project = "fsspec" 56 | copyright = "2018, Martin Durant" 57 | author = "Martin Durant" 58 | 59 | # The version info for the project you're documenting, acts as replacement for 60 | # |version| and |release|, also used in various other places throughout the 61 | # built documents. 62 | # 63 | # The short X.Y version. 64 | import fsspec 65 | 66 | version = fsspec.__version__ 67 | # The full version, including alpha/beta/rc tags. 68 | release = fsspec.__version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | # 73 | # This is also used if you do content translation via gettext catalogs. 74 | # Usually you set "language" from the command line for these cases. 75 | language = None 76 | 77 | # List of patterns, relative to source directory, that match files and 78 | # directories to ignore when looking for source files. 79 | # This patterns also effect to html_static_path and html_extra_path 80 | exclude_patterns = [] 81 | 82 | # The name of the Pygments (syntax highlighting) style to use. 83 | pygments_style = "sphinx" 84 | 85 | # If true, `todo` and `todoList` produce output, else they produce nothing. 86 | todo_include_todos = False 87 | 88 | 89 | # -- Options for HTML output ---------------------------------------------- 90 | 91 | # The theme to use for HTML and HTML Help pages. See the documentation for 92 | # a list of builtin themes. 93 | # 94 | html_theme = "sphinx_rtd_theme" 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # 100 | # html_theme_options = {} 101 | 102 | # Add any paths that contain custom static files (such as style sheets) here, 103 | # relative to this directory. They are copied after the builtin static files, 104 | # so a file named "default.css" will overwrite the builtin "default.css". 105 | html_static_path = [] 106 | 107 | # Custom sidebar templates, must be a dictionary that maps document names 108 | # to template names. 109 | # 110 | # This is required for the alabaster theme 111 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 112 | html_sidebars = { 113 | "**": [ 114 | "relations.html", # needs 'show_related': True theme option to display 115 | "searchbox.html", 116 | ] 117 | } 118 | 119 | 120 | # -- Options for HTMLHelp output ------------------------------------------ 121 | 122 | # Output file base name for HTML help builder. 123 | htmlhelp_basename = "fsspecdoc" 124 | 125 | 126 | # -- Options for LaTeX output --------------------------------------------- 127 | 128 | latex_elements = { 129 | # The paper size ('letterpaper' or 'a4paper'). 
130 | # 131 | # 'papersize': 'letterpaper', 132 | # The font size ('10pt', '11pt' or '12pt'). 133 | # 134 | # 'pointsize': '10pt', 135 | # Additional stuff for the LaTeX preamble. 136 | # 137 | # 'preamble': '', 138 | # Latex figure (float) alignment 139 | # 140 | # 'figure_align': 'htbp', 141 | } 142 | 143 | # Grouping the document tree into LaTeX files. List of tuples 144 | # (source start file, target name, title, 145 | # author, documentclass [howto, manual, or own class]). 146 | latex_documents = [ 147 | (master_doc, "fsspec.tex", "fsspec Documentation", "Joseph Crail", "manual") 148 | ] 149 | 150 | 151 | # -- Options for manual page output --------------------------------------- 152 | 153 | # One entry per manual page. List of tuples 154 | # (source start file, name, description, authors, manual section). 155 | man_pages = [(master_doc, "fsspec", "fsspec Documentation", [author], 1)] 156 | 157 | 158 | # -- Options for Texinfo output ------------------------------------------- 159 | 160 | # Grouping the document tree into Texinfo files. List of tuples 161 | # (source start file, target name, title, author, 162 | # dir menu entry, description, category) 163 | texinfo_documents = [ 164 | ( 165 | master_doc, 166 | "fsspec", 167 | "fsspec Documentation", 168 | author, 169 | "fsspec", 170 | "One line description of project.", 171 | "Miscellaneous", 172 | ) 173 | ] 174 | 175 | extlinks = { 176 | "issue": ("https://github.com/intake/filesystem_spec/issues/%s", "GH#"), 177 | "pr": ("https://github.com/intake/filesystem_spec/pull/%s", "GH#"), 178 | } 179 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. currentmodule:: fsspec 5 | 6 | User Functions 7 | -------------- 8 | 9 | .. autosummary:: 10 | fsspec.open_files 11 | fsspec.open 12 | fsspec.open_local 13 | fsspec.filesystem 14 | fsspec.get_filesystem_class 15 | fsspec.get_mapper 16 | fsspec.fuse.run 17 | fsspec.gui.FileSelector 18 | 19 | .. autofunction:: fsspec.open_files 20 | .. autofunction:: fsspec.open 21 | .. autofunction:: fsspec.open_local 22 | .. autofunction:: fsspec.filesystem 23 | .. autofunction:: fsspec.get_filesystem_class 24 | .. autofunction:: fsspec.get_mapper 25 | .. autofunction:: fsspec.fuse.run 26 | .. autoclass:: fsspec.gui.FileSelector 27 | :members: 28 | 29 | Base Classes 30 | ------------ 31 | 32 | .. autosummary:: 33 | fsspec.spec.AbstractFileSystem 34 | fsspec.spec.Transaction 35 | fsspec.spec.AbstractBufferedFile 36 | fsspec.FSMap 37 | fsspec.asyn.AsyncFileSystem 38 | fsspec.core.OpenFile 39 | fsspec.core.OpenFiles 40 | fsspec.core.BaseCache 41 | fsspec.core.get_fs_token_paths 42 | fsspec.dircache.DirCache 43 | fsspec.registry.ReadOnlyRegistry 44 | fsspec.registry.register_implementation 45 | 46 | .. autoclass:: fsspec.spec.AbstractFileSystem 47 | :members: 48 | 49 | .. autoclass:: fsspec.spec.Transaction 50 | :members: 51 | 52 | .. autoclass:: fsspec.spec.AbstractBufferedFile 53 | :members: 54 | 55 | .. autoclass:: fsspec.asyn.AsyncFileSystem 56 | 57 | .. autoclass:: fsspec.FSMap 58 | :members: 59 | 60 | .. autoclass:: fsspec.core.OpenFile 61 | :members: 62 | 63 | .. autoclass:: fsspec.core.OpenFiles 64 | 65 | .. autoclass:: fsspec.core.BaseCache 66 | :members: 67 | 68 | .. autofunction:: fsspec.core.get_fs_token_paths 69 | 70 | .. autoclass:: fsspec.dircache.DirCache 71 | :members: __init__ 72 | 73 | .. 
autoclass:: fsspec.registry.ReadOnlyRegistry 74 | :members: __init__ 75 | 76 | .. autofunction:: fsspec.registry.register_implementation 77 | 78 | .. _implementations: 79 | 80 | Built-in Implementations 81 | ------------------------ 82 | 83 | .. autosummary:: 84 | fsspec.implementations.ftp.FTPFileSystem 85 | fsspec.implementations.hdfs.PyArrowHDFS 86 | fsspec.implementations.dask.DaskWorkerFileSystem 87 | fsspec.implementations.http.HTTPFileSystem 88 | fsspec.implementations.local.LocalFileSystem 89 | fsspec.implementations.memory.MemoryFileSystem 90 | fsspec.implementations.github.GithubFileSystem 91 | fsspec.implementations.sftp.SFTPFileSystem 92 | fsspec.implementations.webhdfs.WebHDFS 93 | fsspec.implementations.zip.ZipFileSystem 94 | fsspec.implementations.cached.CachingFileSystem 95 | fsspec.implementations.cached.WholeFileCacheFileSystem 96 | fsspec.implementations.cached.SimpleCacheFileSystem 97 | fsspec.implementations.git.GitFileSystem 98 | fsspec.implementations.smb.SMBFileSystem 99 | fsspec.implementations.jupyter.JupyterFileSystem 100 | fsspec.implementations.libarchive.LibArchiveFileSystem 101 | fsspec.implementations.dbfs.DatabricksFileSystem 102 | fsspec.implementations.reference.ReferenceFileSystem 103 | 104 | .. autoclass:: fsspec.implementations.ftp.FTPFileSystem 105 | :members: __init__ 106 | 107 | .. autoclass:: fsspec.implementations.hdfs.PyArrowHDFS 108 | :members: __init__ 109 | 110 | .. autoclass:: fsspec.implementations.dask.DaskWorkerFileSystem 111 | :members: __init__ 112 | 113 | .. autoclass:: fsspec.implementations.http.HTTPFileSystem 114 | :members: __init__ 115 | 116 | .. autoclass:: fsspec.implementations.local.LocalFileSystem 117 | :members: __init__ 118 | 119 | .. autoclass:: fsspec.implementations.memory.MemoryFileSystem 120 | :members: __init__ 121 | 122 | .. autoclass:: fsspec.implementations.sftp.SFTPFileSystem 123 | :members: __init__ 124 | 125 | .. autoclass:: fsspec.implementations.webhdfs.WebHDFS 126 | :members: __init__ 127 | 128 | .. autoclass:: fsspec.implementations.zip.ZipFileSystem 129 | :members: __init__ 130 | 131 | .. autoclass:: fsspec.implementations.cached.CachingFileSystem 132 | :members: __init__ 133 | 134 | .. autoclass:: fsspec.implementations.cached.WholeFileCacheFileSystem 135 | :members: __init__ 136 | 137 | .. autoclass:: fsspec.implementations.cached.SimpleCacheFileSystem 138 | :members: __init__ 139 | 140 | .. autoclass:: fsspec.implementations.github.GithubFileSystem 141 | :members: __init__ 142 | 143 | .. autoclass:: fsspec.implementations.git.GitFileSystem 144 | :members: __init__ 145 | 146 | .. autoclass:: fsspec.implementations.smb.SMBFileSystem 147 | :members: __init__ 148 | 149 | .. autoclass:: fsspec.implementations.jupyter.JupyterFileSystem 150 | :members: __init__ 151 | 152 | .. autoclass:: fsspec.implementations.libarchive.LibArchiveFileSystem 153 | :members: __init__ 154 | 155 | .. autoclass:: fsspec.implementations.dbfs.DatabricksFileSystem 156 | :members: __init__ 157 | 158 | .. autoclass:: fsspec.implementations.reference.ReferenceFileSystem 159 | :members: __init__ 160 | 161 | Other Known Implementations 162 | --------------------------- 163 | 164 | - `s3fs`_ for Amazon S3 and other compatible stores 165 | - `gcsfs`_ for Google Cloud Storage 166 | - `adl`_ for Azure DataLake storage 167 | - `abfs`_ for Azure Blob service 168 | - `dropbox`_ for access to dropbox shares 169 | - `gdrive`_ to access Google Drive and shares (experimental) 170 | 171 | .. _s3fs: https://s3fs.readthedocs.io/en/latest/ 172 | .. 
_gcsfs: https://gcsfs.readthedocs.io/en/latest/ 173 | .. _adl: https://github.com/dask/adlfs 174 | .. _abfs: https://github.com/dask/adlfs 175 | .. _dropbox: https://github.com/MarineChap/intake_dropbox 176 | .. _gdrive: https://github.com/intake/gdrivefs 177 | 178 | .. _readbuffering: 179 | 180 | Read Buffering 181 | -------------- 182 | 183 | .. autosummary:: 184 | 185 | fsspec.caching.ReadAheadCache 186 | fsspec.caching.BytesCache 187 | fsspec.caching.MMapCache 188 | fsspec.caching.BlockCache 189 | 190 | .. autoclass:: fsspec.caching.ReadAheadCache 191 | :members: 192 | 193 | .. autoclass:: fsspec.caching.BytesCache 194 | :members: 195 | 196 | .. autoclass:: fsspec.caching.MMapCache 197 | :members: 198 | 199 | .. autoclass:: fsspec.caching.BlockCache 200 | :members: 201 | -------------------------------------------------------------------------------- /fsspec/tests/test_file.py: -------------------------------------------------------------------------------- 1 | """Tests abstract buffered file API, using FTP implementation""" 2 | import pickle 3 | import sys 4 | import pytest 5 | from fsspec.implementations.tests.test_ftp import FTPFileSystem 6 | 7 | data = b"hello" * 10000 8 | 9 | 10 | @pytest.mark.xfail( 11 | sys.version_info < (3, 6), 12 | reason="py35 error, see https://github.com/intake/filesystem_spec/issues/147", 13 | ) 14 | def test_pickle(ftp_writable): 15 | host, port, user, pw = ftp_writable 16 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 17 | 18 | f = ftp.open("/out", "rb") 19 | 20 | f2 = pickle.loads(pickle.dumps(f)) 21 | assert f == f2 22 | 23 | 24 | def test_file_read_attributes(ftp_writable): 25 | host, port, user, pw = ftp_writable 26 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 27 | 28 | f = ftp.open("/out", "rb") 29 | assert f.info()["size"] == len(data) 30 | assert f.tell() == 0 31 | assert f.seekable() 32 | assert f.readable() 33 | assert not f.writable() 34 | out = bytearray(len(data)) 35 | 36 | assert f.read() == data 37 | assert f.read() == b"" 38 | f.seek(0) 39 | assert f.readuntil(b"l") == b"hel" 40 | assert f.tell() == 3 41 | 42 | f.readinto1(out) 43 | assert out[:-3] == data[3:] 44 | with pytest.raises(ValueError): 45 | f.write(b"") 46 | f.close() 47 | with pytest.raises(ValueError): 48 | f.read()(b"") 49 | 50 | 51 | def test_seek(ftp_writable): 52 | host, port, user, pw = ftp_writable 53 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 54 | 55 | f = ftp.open("/out", "rb") 56 | 57 | assert f.seek(-10, 2) == len(data) - 10 58 | assert f.tell() == len(data) - 10 59 | assert f.seek(-1, 1) == len(data) - 11 60 | with pytest.raises(ValueError): 61 | f.seek(-1) 62 | with pytest.raises(ValueError): 63 | f.seek(0, 7) 64 | 65 | 66 | def test_file_idempotent(ftp_writable): 67 | host, port, user, pw = ftp_writable 68 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 69 | 70 | f = ftp.open("/out", "rb") 71 | f2 = ftp.open("/out", "rb") 72 | assert hash(f) == hash(f2) 73 | assert f == f2 74 | ftp.touch("/out2") 75 | f2 = ftp.open("/out2", "rb") 76 | assert hash(f2) != hash(f) 77 | assert f != f2 78 | f2 = ftp.open("/out", "wb") 79 | assert hash(f2) != hash(f) 80 | 81 | 82 | def test_file_text_attributes(ftp_writable): 83 | host, port, user, pw = ftp_writable 84 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 85 | 86 | data = b"hello\n" * 1000 87 | with ftp.open("/out2", "wb") as f: 88 | f.write(data) 89 | 90 | f = ftp.open("/out2", "rb") 91 | assert 
92 | f.seek(0) 93 | assert list(f) == [d + b"\n" for d in data.split()] 94 | f.seek(0) 95 | assert f.readlines() == [d + b"\n" for d in data.split()] 96 | 97 | f = ftp.open("/out2", "rt") 98 | assert f.readline() == "hello\n" 99 | assert f.encoding 100 | 101 | 102 | def test_file_write_attributes(ftp_writable): 103 | host, port, user, pw = ftp_writable 104 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) 105 | f = ftp.open("/out2", "wb") 106 | with pytest.raises(ValueError): 107 | f.info() 108 | with pytest.raises(OSError): 109 | f.seek(0) 110 | with pytest.raises(ValueError): 111 | f.read(0) 112 | assert not f.readable() 113 | assert f.writable() 114 | 115 | f.flush() # no-op 116 | 117 | assert f.write(b"hello") == 5 118 | assert f.write(b"hello") == 5 119 | assert not f.closed 120 | f.close() 121 | assert f.closed 122 | with pytest.raises(ValueError): 123 | f.write(b"") 124 | with pytest.raises(ValueError): 125 | f.flush() 126 | 127 | 128 | def test_midread_cache(ftp_writable): 129 | host, port, user, pw = ftp_writable 130 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) 131 | fn = "/myfile" 132 | with fs.open(fn, "wb") as f: 133 | f.write(b"a" * 175627146) 134 | with fs.open(fn, "rb") as f: 135 | f.seek(175561610) 136 | d1 = f.read(65536) 137 | assert len(d1) == 65536 138 | 139 | f.seek(4) 140 | size = 17562198 141 | d2 = f.read(size) 142 | assert len(d2) == size 143 | 144 | f.seek(17562288) 145 | size = 17562187 146 | d3 = f.read(size) 147 | assert len(d3) == size 148 | 149 | 150 | def test_read_block(ftp_writable): 151 | # not the same as test_read_block in test_utils, this depends on the 152 | # behaviour of the bytes caching 153 | from fsspec.utils import read_block 154 | 155 | host, port, user, pw = ftp_writable 156 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) 157 | fn = "/myfile" 158 | with fs.open(fn, "wb") as f: 159 | f.write(b"a,b\n1,2") 160 | f = fs.open(fn, "rb", cache_type="bytes") 161 | assert read_block(f, 0, 6400, b"\n") == b"a,b\n1,2" 162 | 163 | 164 | def test_with_gzip(ftp_writable): 165 | import gzip 166 | 167 | data = b"some compressible stuff" 168 | host, port, user, pw = ftp_writable 169 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) 170 | fn = "/myfile" 171 | with fs.open(fn, "wb") as f: 172 | gf = gzip.GzipFile(fileobj=f, mode="w") 173 | gf.write(data) 174 | gf.close() 175 | with fs.open(fn, "rb") as f: 176 | gf = gzip.GzipFile(fileobj=f, mode="r") 177 | assert gf.read() == data 178 | 179 | 180 | def test_with_zip(ftp_writable): 181 | import zipfile 182 | 183 | data = b"hello zip" 184 | host, port, user, pw = ftp_writable 185 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) 186 | fn = "/myfile.zip" 187 | inner_file = "test.txt" 188 | with fs.open(fn, "wb") as f: 189 | zf = zipfile.ZipFile(f, mode="w") 190 | zf.writestr(inner_file, data) 191 | zf.close() 192 | with fs.open(fn, "rb") as f: 193 | zf = zipfile.ZipFile(f, mode="r") 194 | assert zf.read(inner_file) == data 195 | --------------------------------------------------------------------------------
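The behaviours exercised above (seek whence codes, mode checks, reading after close) belong to the generic buffered-file contract, and the ``cache_type="bytes"`` argument in ``test_read_block`` is how one of the read caches from the Read Buffering section is selected by name. A minimal sketch of the same seek semantics, run against the in-memory filesystem so that no FTP server is required; on genuinely remote, buffered backends the same ``open()`` call also accepts ``cache_type``:

```python
import fsspec

# In-memory stand-in for the FTP fixture used by the tests above.
fs = fsspec.filesystem("memory")
data = b"hello" * 10000
with fs.open("/out", "wb") as f:
    f.write(data)

f = fs.open("/out", "rb")
assert f.seek(-10, 2) == len(data) - 10  # whence=2: relative to end of file
assert f.seek(-1, 1) == len(data) - 11   # whence=1: relative to current position
assert f.read() == data[-11:]
f.close()
```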
/fsspec/implementations/hdfs.py: -------------------------------------------------------------------------------- 1 | import weakref 2 | from ..spec import AbstractFileSystem 3 | from ..utils import infer_storage_options 4 | from pyarrow.hdfs import HadoopFileSystem 5 | 6 | 7 | class PyArrowHDFS(AbstractFileSystem): 8 | """Adapted version of Arrow's HadoopFileSystem 9 | 10 | This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which 11 | passes on all calls to the underlying class. 12 | """ 13 | 14 | protocol = "hdfs" 15 | 16 | def __init__( 17 | self, 18 | host="default", 19 | port=0, 20 | user=None, 21 | kerb_ticket=None, 22 | driver="libhdfs", 23 | extra_conf=None, 24 | **kwargs 25 | ): 26 | """ 27 | 28 | Parameters 29 | ---------- 30 | host: str 31 | Hostname, IP or "default" to try to read from Hadoop config 32 | port: int 33 | Port to connect on, or default from Hadoop config if 0 34 | user: str or None 35 | If given, connect as this username 36 | kerb_ticket: str or None 37 | If given, use this ticket for authentication 38 | driver: 'libhdfs' or 'libhdfs3' 39 | Binary driver; libhdfs is the JNI library and the default 40 | extra_conf: None or dict 41 | Passed on to HadoopFileSystem 42 | """ 43 | if self._cached: 44 | return 45 | AbstractFileSystem.__init__(self, **kwargs) 46 | self.pars = (host, port, user, kerb_ticket, driver, extra_conf) 47 | pahdfs = HadoopFileSystem( 48 | host=host, 49 | port=port, 50 | user=user, 51 | kerb_ticket=kerb_ticket, 52 | driver=driver, 53 | extra_conf=extra_conf, 54 | ) 55 | weakref.finalize(self, lambda: pahdfs.close()) 56 | self.pahdfs = pahdfs 57 | 58 | def _open( 59 | self, 60 | path, 61 | mode="rb", 62 | block_size=None, 63 | autocommit=True, 64 | cache_options=None, 65 | **kwargs 66 | ): 67 | """ 68 | 69 | Parameters 70 | ---------- 71 | path: str 72 | Location of file; should start with '/' 73 | mode: str 74 | block_size: int 75 | Hadoop block size, e.g., 2**26 76 | autocommit: True 77 | Transactions are not yet implemented for HDFS; errors if not True 78 | kwargs: dict or None 79 | Hadoop config parameters 80 | 81 | Returns 82 | ------- 83 | HDFSFile file-like instance 84 | """ 85 | 86 | return HDFSFile( 87 | self, 88 | path, 89 | mode, 90 | block_size=block_size, 91 | autocommit=autocommit, 92 | cache_options=cache_options, 93 | **kwargs 94 | ) 95 | 96 | def __reduce_ex__(self, protocol): 97 | return PyArrowHDFS, self.pars 98 | 99 | def ls(self, path, detail=True): 100 | out = self.pahdfs.ls(path, detail) 101 | if detail: 102 | for p in out: 103 | p["type"] = p["kind"] 104 | p["name"] = self._strip_protocol(p["name"]) 105 | else: 106 | out = [self._strip_protocol(p) for p in out] 107 | return out 108 | 109 | @staticmethod 110 | def _get_kwargs_from_urls(path): 111 | ops = infer_storage_options(path) 112 | out = {} 113 | if ops.get("host", None): 114 | out["host"] = ops["host"] 115 | if ops.get("username", None): 116 | out["user"] = ops["username"] 117 | if ops.get("port", None): 118 | out["port"] = ops["port"] 119 | return out 120 | 121 | def close(self): 122 | self.pahdfs.close() 123 | 124 | @classmethod 125 | def _strip_protocol(cls, path): 126 | ops = infer_storage_options(path) 127 | return ops["path"] 128 | 129 | def __getattribute__(self, item): 130 | if item in [ 131 | "_open", 132 | "close", 133 | "__init__", 134 | "__getattribute__", 135 | "__reduce_ex__", 136 | "open", 137 | "ls", 138 | "makedirs", 139 | ]: 140 | # all the methods defined in this class. Note `open` here, since
141 | # it calls `_open`, but is actually in superclass 142 | return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw) 143 | if item == "__class__": 144 | return PyArrowHDFS 145 | d = object.__getattribute__(self, "__dict__") 146 | pahdfs = d.get("pahdfs", None) # fs is not immediately defined 147 | if pahdfs is not None and item in [ 148 | "chmod", 149 | "chown", 150 | "user", 151 | "df", 152 | "disk_usage", 153 | "download", 154 | "driver", 155 | "exists", 156 | "extra_conf", 157 | "get_capacity", 158 | "get_space_used", 159 | "host", 160 | "is_open", 161 | "kerb_ticket", 162 | "strip_protocol", 163 | "mkdir", 164 | "mv", 165 | "port", 166 | "upload", 167 | "_get_kwargs_from_urls", 168 | "read_parquet", 169 | "rm", 170 | "stat", 171 | ]: 172 | return getattr(pahdfs, item) 173 | else: 174 | # attributes of the superclass, while target is being set up 175 | return super().__getattribute__(item) 176 | 177 | 178 | class HDFSFile(object): 179 | """Wrapper around arrow's HdfsFile 180 | 181 | Allows seek beyond EOF and (eventually) commit/discard 182 | """ 183 | 184 | def __init__( 185 | self, 186 | fs, 187 | path, 188 | mode, 189 | block_size, 190 | autocommit=True, 191 | cache_type="readahead", 192 | cache_options=None, 193 | **kwargs 194 | ): 195 | # TODO: Inherit from AbstractBufferedFile? 196 | if not autocommit: 197 | raise NotImplementedError( 198 | "HDFSFile cannot be opened with 'autocommit=False'." 199 | ) 200 | 201 | self.fs = fs 202 | self.path = path 203 | self.mode = mode 204 | self.block_size = block_size 205 | self.fh = fs.pahdfs.open(path, mode, block_size, **kwargs) 206 | if self.fh.readable(): 207 | self.seek_size = self.size() 208 | 209 | def seek(self, loc, whence=0): 210 | if whence == 0 and self.readable(): 211 | loc = min(loc, self.seek_size) 212 | return self.fh.seek(loc, whence) 213 | 214 | def __getattr__(self, item): 215 | return getattr(self.fh, item) 216 | 217 | def __reduce_ex__(self, protocol): 218 | return HDFSFile, (self.fs, self.path, self.mode, self.block_size) 219 | 220 | def __enter__(self): 221 | return self 222 | 223 | def __exit__(self, exc_type, exc_val, exc_tb): 224 | self.close() 225 | --------------------------------------------------------------------------------
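``PyArrowHDFS`` is registered under the "hdfs" protocol, so it is normally reached through ``fsspec.filesystem``. A minimal sketch, assuming pyarrow is installed and a namenode is reachable; the host, port and path below are placeholders:

```python
import fsspec

# Hypothetical connection details; substitute your own namenode.
fs = fsspec.filesystem("hdfs", host="namenode", port=8020)
print(fs.ls("/"))

# Files come back as HDFSFile wrappers, which delegate to pyarrow's
# HdfsFile but tolerate seeks beyond end-of-file.
with fs.open("/tmp/example.txt", "rb") as f:
    head = f.read(100)
```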
/fsspec/implementations/libarchive.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | 3 | from contextlib import contextmanager 4 | 5 | import libarchive 6 | from fsspec import AbstractFileSystem, open_files 7 | from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE 8 | from fsspec.implementations.memory import MemoryFile 9 | 10 | 11 | class LibArchiveFileSystem(AbstractFileSystem): 12 | """Compressed archives as a file-system (read-only) 13 | 14 | Supports the following formats: 15 | tar, pax, cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar, 16 | Microsoft CAB, 7-Zip, WARC 17 | 18 | See the libarchive documentation for further restrictions. 19 | 20 | Keeps file object open while instance lives. 21 | 22 | This class is pickleable, but not necessarily thread-safe 23 | """ 24 | 25 | root_marker = "" 26 | protocol = "libarchive" 27 | 28 | def __init__( 29 | self, 30 | fo="", 31 | mode="r", 32 | target_protocol=None, 33 | target_options=None, 34 | block_size=DEFAULT_BLOCK_SIZE, 35 | **kwargs 36 | ): 37 | """ 38 | Parameters 39 | ---------- 40 | fo: str or file-like 41 | Contains the archive, and must exist. If a str, will fetch file using 42 | `open_files()`, which must return one file exactly. 43 | mode: str 44 | Currently, only 'r' accepted 45 | target_protocol: str (optional) 46 | If ``fo`` is a string, this value can be used to override the 47 | FS protocol inferred from a URL 48 | target_options: dict (optional) 49 | Kwargs passed when instantiating the target FS, if ``fo`` is 50 | a string. 51 | """ 52 | super().__init__(**kwargs) 53 | if mode != "r": 54 | raise ValueError("Only read from archive files accepted") 55 | if isinstance(fo, str): 56 | files = open_files(fo, protocol=target_protocol, **(target_options or {})) 57 | if len(files) != 1: 58 | raise ValueError( 59 | 'Path "{}" did not resolve to exactly ' 60 | 'one file: "{}"'.format(fo, files) 61 | ) 62 | fo = files[0] 63 | self.fo = fo.__enter__() # the whole instance is a context 64 | # self.arc_reader = 65 | self.block_size = block_size 66 | self.dir_cache = None 67 | 68 | @contextmanager 69 | def _open_archive(self): 70 | self.fo.seek(0) 71 | with libarchive.fd_reader(self.fo.fileno(), block_size=self.block_size) as arc: 72 | yield arc 73 | 74 | @classmethod 75 | def _strip_protocol(cls, path): 76 | # file paths are always relative to the archive root 77 | return super()._strip_protocol(path).lstrip("/") 78 | 79 | def _get_dirs(self): 80 | fields = { 81 | "name": "pathname", 82 | "size": "size", 83 | "created": "ctime", 84 | "mode": "mode", 85 | "uid": "uid", 86 | "gid": "gid", 87 | "mtime": "mtime", 88 | } 89 | 90 | if self.dir_cache is not None: 91 | return 92 | 93 | self.dir_cache = {} 94 | list_names = [] 95 | with self._open_archive() as arc: 96 | for entry in arc: 97 | if not entry.isdir and not entry.isfile: 98 | # Skip symbolic links, fifo entries, etc.
99 | continue 100 | self.dir_cache.update( 101 | { 102 | dirname 103 | + "/": {"name": dirname + "/", "size": 0, "type": "directory"} 104 | for dirname in self._all_dirnames({entry.name}) 105 | } 106 | ) 107 | f = {key: getattr(entry, fields[key]) for key in fields} 108 | f["type"] = "directory" if entry.isdir else "file" 109 | list_names.append(entry.name) 110 | 111 | self.dir_cache[f["name"]] = f 112 | # libarchive does not seem to return an entry for the directories (at least 113 | # not in all formats), so get the directory names from the file names 114 | self.dir_cache.update( 115 | { 116 | dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"} 117 | for dirname in self._all_dirnames(list_names) 118 | } 119 | ) 120 | 121 | def info(self, path, **kwargs): 122 | self._get_dirs() 123 | path = self._strip_protocol(path) 124 | if path in self.dir_cache: 125 | return self.dir_cache[path] 126 | elif path + "/" in self.dir_cache: 127 | return self.dir_cache[path + "/"] 128 | else: 129 | raise FileNotFoundError(path) 130 | 131 | def ls(self, path, detail=False, **kwargs): 132 | self._get_dirs() 133 | paths = {} 134 | 135 | for p, f in self.dir_cache.items(): 136 | p = p.rstrip("/") 137 | if "/" in p: 138 | root = p.rsplit("/", 1)[0] 139 | else: 140 | root = "" 141 | if root == path.rstrip("/"): 142 | paths[p] = f 143 | elif all( 144 | (a == b) 145 | for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) 146 | ): 147 | # root directory entry 148 | ppath = p.rstrip("/").split("/", 1)[0] 149 | if ppath not in paths: 150 | out = {"name": ppath + "/", "size": 0, "type": "directory"} 151 | paths[ppath] = out 152 | out = list(paths.values()) 153 | if detail: 154 | return out 155 | else: 156 | return list(sorted(f["name"] for f in out)) 157 | 158 | def _open( 159 | self, 160 | path, 161 | mode="rb", 162 | block_size=None, 163 | autocommit=True, 164 | cache_options=None, 165 | **kwargs 166 | ): 167 | path = self._strip_protocol(path) 168 | if mode != "rb": 169 | raise NotImplementedError 170 | 171 | data = bytes() 172 | with self._open_archive() as arc: 173 | # FIXME? dropwhile would increase performance but less readable 174 | for entry in arc: 175 | if entry.pathname != path: 176 | continue 177 | for block in entry.get_blocks(entry.size): 178 | data += block 179 | break 180 | else: 181 | raise ValueError 182 | return MemoryFile(fs=self, path=path, data=data) 183 | 184 | def ukey(self, path): 185 | return tokenize(path, self.fo, self.protocol) 186 | 187 | def _all_dirnames(self, paths): 188 | """Returns *all* directory names for each path in paths, including intermediate ones. 189 | 190 | Parameters 191 | ---------- 192 | paths: Iterable of path strings 193 | """ 194 | if len(paths) == 0: 195 | return set() 196 | 197 | dirnames = {self._parent(path) for path in paths} - {self.root_marker} 198 | return dirnames | self._all_dirnames(dirnames) 199 | --------------------------------------------------------------------------------
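``LibArchiveFileSystem`` is registered under the "libarchive" protocol. A minimal sketch, assuming python-libarchive-c is installed; the archive name and member path below are placeholders for any archive in a format listed above:

```python
import fsspec

# "example.7z" and "inner/file.txt" are hypothetical; any local or remote
# archive that libarchive understands will do.
fs = fsspec.filesystem("libarchive", fo="example.7z")
print(fs.ls(""))                      # top-level entries of the archive
with fs.open("inner/file.txt") as f:  # read-only; the whole member is loaded
    content = f.read()
```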
/fsspec/implementations/github.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from ..spec import AbstractFileSystem 3 | from ..utils import infer_storage_options 4 | from .memory import MemoryFile 5 | 6 | 7 | class GithubFileSystem(AbstractFileSystem): 8 | """Interface to files in github 9 | 10 | An instance of this class provides the files residing within a remote github 11 | repository. You may specify a point in the repo's history, by SHA, branch 12 | or tag (default is current master). 13 | 14 | Given that code files tend to be small, and that github does not support 15 | retrieving partial content, we always fetch whole files. 16 | 17 | When using fsspec.open, allows URIs of the form: 18 | 19 | - "github://path/file", in which case you must specify org, repo and 20 | may specify sha in the extra args 21 | - 'github://org:repo@/precip/catalog.yml', where the org and repo are 22 | part of the URI 23 | - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included 24 | 25 | ``sha`` can be the full or abbreviated hex of the commit you want to fetch 26 | from, or a branch or tag name (so long as it doesn't contain special characters 27 | like "/", "?", which would have to be HTTP-encoded). 28 | 29 | For authorised access, you must provide username and token, which can be made 30 | at https://github.com/settings/tokens 31 | """ 32 | 33 | url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" 34 | rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" 35 | protocol = "github" 36 | 37 | def __init__(self, org, repo, sha="master", username=None, token=None, **kwargs): 38 | super().__init__(**kwargs) 39 | self.org = org 40 | self.repo = repo 41 | self.root = sha 42 | if (username is None) ^ (token is None): 43 | raise ValueError("Auth requires both username and token") 44 | self.username = username 45 | self.token = token 46 | self.ls("") 47 | 48 | @property 49 | def kw(self): 50 | if self.username: 51 | return {"auth": (self.username, self.token)} 52 | return {} 53 | 54 | @classmethod 55 | def repos(cls, org_or_user, is_org=True): 56 | """List repo names for given org or user 57 | 58 | This may become the top level of the FS 59 | 60 | Parameters 61 | ---------- 62 | org_or_user: str 63 | Name of the github org or user to query 64 | is_org: bool (default True) 65 | Whether the name is an organisation (True) or user (False) 66 | 67 | Returns 68 | ------- 69 | List of string 70 | """ 71 | r = requests.get( 72 | "https://api.github.com/{part}/{org}/repos".format( 73 | part=["users", "orgs"][is_org], org=org_or_user 74 | ) 75 | ) 76 | r.raise_for_status() 77 | return [repo["name"] for repo in r.json()] 78 | 79 | @property 80 | def tags(self): 81 | """Names of tags in the repo""" 82 | r = requests.get( 83 | "https://api.github.com/repos/{org}/{repo}/tags" 84 | "".format(org=self.org, repo=self.repo), 85 | **self.kw 86 | ) 87 | r.raise_for_status() 88 | return [t["name"] for t in r.json()] 89 | 90 | @property 91 | def branches(self): 92 | """Names of branches in the repo""" 93 | r = requests.get( 94 | "https://api.github.com/repos/{org}/{repo}/branches" 95 | "".format(org=self.org, repo=self.repo), 96 | **self.kw 97 | ) 98 | r.raise_for_status() 99 | return [t["name"] for t in r.json()] 100 | 101 | @property 102 | def refs(self): 103 | """Named references, tags and branches""" 104 | return {"tags": self.tags, "branches": self.branches} 105 | 106 | def ls(self, path, detail=False, sha=None, _sha=None, **kwargs): 107 | """List files at given path 108 | 109 | Parameters 110 | ---------- 111 | path: str 112 | Location to list, relative to repo root 113 | detail: bool 114 | If True, returns list of dicts, one per file; if False, returns 115 | list of full filenames only 116 | sha: str (optional) 117 | List at the given point in the repo history, branch or tag name or commit 118 | SHA 119 | _sha: str (optional) 120 | List this specific tree object (used internally to descend into trees) 121 | """ 122 | path = self._strip_protocol(path) 123 | if path == "":
"": 124 | _sha = sha or self.root 125 | if _sha is None: 126 | parts = path.rstrip("/").split("/") 127 | so_far = "" 128 | _sha = sha or self.root 129 | for part in parts: 130 | out = self.ls(so_far, True, sha=sha, _sha=_sha) 131 | so_far += "/" + part if so_far else part 132 | out = [o for o in out if o["name"] == so_far] 133 | if not out: 134 | raise FileNotFoundError(path) 135 | out = out[0] 136 | if out["type"] == "file": 137 | if detail: 138 | return [out] 139 | else: 140 | return path 141 | _sha = out["sha"] 142 | if path not in self.dircache or sha not in [self.root, None]: 143 | r = requests.get( 144 | self.url.format(org=self.org, repo=self.repo, sha=_sha), **self.kw 145 | ) 146 | if r.status_code == 404: 147 | raise FileNotFoundError(path) 148 | r.raise_for_status() 149 | out = [ 150 | { 151 | "name": path + "/" + f["path"] if path else f["path"], 152 | "mode": f["mode"], 153 | "type": {"blob": "file", "tree": "directory"}[f["type"]], 154 | "size": f.get("size", 0), 155 | "sha": f["sha"], 156 | } 157 | for f in r.json()["tree"] 158 | ] 159 | if sha in [self.root, None]: 160 | self.dircache[path] = out 161 | else: 162 | out = self.dircache[path] 163 | if detail: 164 | return out 165 | else: 166 | return sorted([f["name"] for f in out]) 167 | 168 | def invalidate_cache(self, path=None): 169 | self.dircache.clear() 170 | 171 | @classmethod 172 | def _strip_protocol(cls, path): 173 | opts = infer_storage_options(path) 174 | if "username" not in opts: 175 | return super()._strip_protocol(path) 176 | return opts["path"].lstrip("/") 177 | 178 | @staticmethod 179 | def _get_kwargs_from_urls(path): 180 | opts = infer_storage_options(path) 181 | if "username" not in opts: 182 | return {} 183 | out = {"org": opts["username"], "repo": opts["password"]} 184 | if opts["host"]: 185 | out["sha"] = opts["host"] 186 | return out 187 | 188 | def _open( 189 | self, 190 | path, 191 | mode="rb", 192 | block_size=None, 193 | autocommit=True, 194 | cache_options=None, 195 | sha=None, 196 | **kwargs 197 | ): 198 | if mode != "rb": 199 | raise NotImplementedError 200 | url = self.rurl.format( 201 | org=self.org, repo=self.repo, path=path, sha=sha or self.root 202 | ) 203 | r = requests.get(url, **self.kw) 204 | if r.status_code == 404: 205 | raise FileNotFoundError(path) 206 | r.raise_for_status() 207 | return MemoryFile(None, None, r.content) 208 | -------------------------------------------------------------------------------- /fsspec/mapping.py: -------------------------------------------------------------------------------- 1 | import array 2 | from collections.abc import MutableMapping 3 | from .core import url_to_fs 4 | 5 | 6 | class FSMap(MutableMapping): 7 | """Wrap a FileSystem instance as a mutable wrapping. 8 | 9 | The keys of the mapping become files under the given root, and the 10 | values (which must be bytes) the contents of those files. 11 | 12 | Parameters 13 | ---------- 14 | root: string 15 | prefix for all the files 16 | fs: FileSystem instance 17 | check: bool (=True) 18 | performs a touch at the location, to check for write access. 
/fsspec/mapping.py: -------------------------------------------------------------------------------- 1 | import array 2 | from collections.abc import MutableMapping 3 | from .core import url_to_fs 4 | 5 | 6 | class FSMap(MutableMapping): 7 | """Wrap a FileSystem instance as a mutable mapping. 8 | 9 | The keys of the mapping become files under the given root, and the 10 | values (which must be bytes) the contents of those files. 11 | 12 | Parameters 13 | ---------- 14 | root: string 15 | prefix for all the files 16 | fs: FileSystem instance 17 | check: bool (=False) 18 | if True, performs a touch at the location, to check for write access. 19 | 20 | Examples 21 | -------- 22 | >>> fs = FileSystem(**parameters) # doctest: +SKIP 23 | >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP 24 | or, more likely 25 | >>> d = fs.get_mapper('my-data/path/') 26 | 27 | >>> d['loc1'] = b'Hello World' # doctest: +SKIP 28 | >>> list(d.keys()) # doctest: +SKIP 29 | ['loc1'] 30 | >>> d['loc1'] # doctest: +SKIP 31 | b'Hello World' 32 | """ 33 | 34 | def __init__(self, root, fs, check=False, create=False, missing_exceptions=None): 35 | self.fs = fs 36 | self.root = fs._strip_protocol(root).rstrip( 37 | "/" 38 | ) # we join on '/' in _key_to_str 39 | if missing_exceptions is None: 40 | missing_exceptions = ( 41 | FileNotFoundError, 42 | IsADirectoryError, 43 | NotADirectoryError, 44 | ) 45 | self.missing_exceptions = missing_exceptions 46 | if create: 47 | if not self.fs.exists(root): 48 | self.fs.mkdir(root) 49 | if check: 50 | if not self.fs.exists(root): 51 | raise ValueError( 52 | "Path %s does not exist. Create " 53 | " with the ``create=True`` keyword" % root 54 | ) 55 | self.fs.touch(root + "/a") 56 | self.fs.rm(root + "/a") 57 | 58 | def clear(self): 59 | """Remove all keys below root - empties out mapping""" 60 | try: 61 | self.fs.rm(self.root, True) 62 | self.fs.mkdir(self.root) 63 | except: # noqa: E722 64 | pass 65 | 66 | def getitems(self, keys, on_error="raise"): 67 | """Fetch multiple items from the store 68 | 69 | If the backend is async-able, this might proceed concurrently 70 | 71 | Parameters 72 | ---------- 73 | keys: list(str) 74 | The keys to be fetched 75 | on_error : "raise", "omit", "return" 76 | If raise, an underlying exception will be raised (converted to KeyError 77 | if the type is in self.missing_exceptions); if omit, keys with exception 78 | will simply not be included in the output; if "return", all keys are 79 | included in the output, but the value will be bytes or an exception 80 | instance.
81 | 82 | Returns 83 | ------- 84 | dict(key, bytes|exception) 85 | """ 86 | keys2 = [self._key_to_str(k) for k in keys] 87 | oe = on_error if on_error == "raise" else "return" 88 | try: 89 | out = self.fs.cat(keys2, on_error=oe) 90 | except self.missing_exceptions as e: 91 | raise KeyError from e 92 | out = { 93 | k: (KeyError() if isinstance(v, self.missing_exceptions) else v) 94 | for k, v in out.items() 95 | } 96 | return { 97 | key: out[k2] 98 | for key, k2 in zip(keys, keys2) 99 | if on_error == "return" or not isinstance(out[k2], BaseException) 100 | } 101 | 102 | def setitems(self, values_dict): 103 | """Set the values of multiple items in the store 104 | 105 | Parameters 106 | ---------- 107 | values_dict: dict(str, bytes) 108 | """ 109 | values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()} 110 | self.fs.pipe(values) 111 | 112 | def delitems(self, keys): 113 | """Remove multiple keys from the store""" 114 | self.fs.rm([self._key_to_str(k) for k in keys]) 115 | 116 | def _key_to_str(self, key): 117 | """Generate full path for the key""" 118 | if isinstance(key, (tuple, list)): 119 | key = str(tuple(key)) 120 | else: 121 | key = str(key) 122 | return "/".join([self.root, key]) if self.root else key 123 | 124 | def _str_to_key(self, s): 125 | """Strip path off to leave key name""" 126 | return s[len(self.root) :].lstrip("/") 127 | 128 | def __getitem__(self, key, default=None): 129 | """Retrieve data""" 130 | k = self._key_to_str(key) 131 | try: 132 | result = self.fs.cat(k) 133 | except self.missing_exceptions: 134 | if default is not None: 135 | return default 136 | raise KeyError(key) 137 | return result 138 | 139 | def pop(self, key, default=None): 140 | result = self.__getitem__(key, default) 141 | try: 142 | del self[key] 143 | except KeyError: 144 | pass 145 | return result 146 | 147 | def __setitem__(self, key, value): 148 | """Store value in key""" 149 | key = self._key_to_str(key) 150 | self.fs.mkdirs(self.fs._parent(key), exist_ok=True) 151 | self.fs.pipe_file(key, maybe_convert(value)) 152 | 153 | def __iter__(self): 154 | return (self._str_to_key(x) for x in self.fs.find(self.root)) 155 | 156 | def __len__(self): 157 | return len(self.fs.find(self.root)) 158 | 159 | def __delitem__(self, key): 160 | """Remove key""" 161 | try: 162 | self.fs.rm(self._key_to_str(key)) 163 | except: # noqa: E722 164 | raise KeyError 165 | 166 | def __contains__(self, key): 167 | """Does key exist in mapping?""" 168 | path = self._key_to_str(key) 169 | return self.fs.exists(path) and self.fs.isfile(path) 170 | 171 | def __reduce__(self): 172 | return FSMap, (self.root, self.fs, False, False, self.missing_exceptions) 173 | 174 | 175 | def maybe_convert(value): 176 | if isinstance(value, array.array) or hasattr(value, "__array__"): 177 | # bytes-like things 178 | value = bytearray(memoryview(value)) 179 | return value 180 | 181 | 182 | def get_mapper(url, check=False, create=False, missing_exceptions=None, **kwargs): 183 | """Create key-value interface for given URL and options 184 | 185 | The URL will be of the form "protocol://location" and point to the root 186 | of the mapper required. All keys will be file-names below this location, 187 | and their values the contents of each key. 188 | 189 | Also accepts compound URLs like zip::s3://bucket/file.zip, see ``fsspec.open``.
190 | 191 | Parameters 192 | ---------- 193 | url: str 194 | Root URL of mapping 195 | check: bool 196 | Whether to attempt to read from the location before instantiation, to 197 | check that the mapping does exist 198 | create: bool 199 | Whether to make the directory corresponding to the root before 200 | instantiating 201 | missing_exceptions: None or tuple 202 | If given, these exception types will be regarded as missing keys and 203 | return KeyError when trying to read data. By default, you get 204 | (FileNotFoundError, IsADirectoryError, NotADirectoryError) 205 | 206 | Returns 207 | ------- 208 | ``FSMap`` instance, the dict-like key-value store. 209 | """ 210 | # Removing protocol here - could defer to each open() on the backend 211 | fs, urlpath = url_to_fs(url, **kwargs) 212 | return FSMap(urlpath, fs, check, create, missing_exceptions=missing_exceptions) 213 | --------------------------------------------------------------------------------
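A minimal sketch of the mapper interface end to end, using the in-memory filesystem so it runs without external services:

```python
import fsspec

m = fsspec.get_mapper("memory://my-data/path")
m["loc1"] = b"Hello World"          # writes the file my-data/path/loc1
assert list(m) == ["loc1"]          # keys are paths relative to the root
assert m["loc1"] == b"Hello World"  # values are the file contents, as bytes
del m["loc1"]
```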