├── requirements.txt ├── fsspec ├── tests │ ├── __init__.py │ ├── test_gui.py │ ├── test_generic.py │ ├── data │ │ └── listing.html │ ├── test_callbacks.py │ ├── test_config.py │ ├── test_registry.py │ ├── test_caches.py │ ├── test_fuse.py │ ├── test_async.py │ ├── test_mapping.py │ ├── test_parquet.py │ ├── conftest.py │ ├── test_compression.py │ └── test_file.py ├── implementations │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_dask.py │ │ ├── test_common.py │ │ ├── test_libarchive.py │ │ ├── test_zip.py │ │ ├── test_jupyter.py │ │ ├── cassettes │ │ │ └── test_dbfs_file_listing.yaml │ │ ├── test_git.py │ │ ├── test_smb.py │ │ ├── test_hdfs.py │ │ ├── test_webhdfs.py │ │ ├── test_memory.py │ │ ├── test_dbfs.py │ │ ├── test_ftp.py │ │ ├── test_arrow.py │ │ └── test_sftp.py │ ├── dvc.py │ ├── zip.py │ ├── jupyter.py │ ├── git.py │ ├── dask.py │ ├── sftp.py │ └── tar.py ├── exceptions.py ├── conftest.py ├── __init__.py ├── archive.py ├── transaction.py ├── dircache.py ├── config.py ├── compression.py ├── generic.py └── callbacks.py ├── .gitattributes ├── readthedocs.yml ├── docs ├── source │ ├── img │ │ └── gui.png │ ├── index.rst │ ├── usage.rst │ ├── developer.rst │ ├── intro.rst │ ├── async.rst │ └── conf.py ├── environment.yml ├── README.md ├── Makefile └── make.bat ├── MANIFEST.in ├── ci ├── environment-py38.yml └── environment-win.yml ├── .coveragerc ├── pyproject.toml ├── .github └── workflows │ ├── pypipublish.yaml │ └── main.yaml ├── .pre-commit-config.yaml ├── setup.cfg ├── LICENSE ├── .gitignore ├── setup.py ├── tox.ini └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fsspec/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fsspec/implementations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | fsspec/_version.py export-subst 2 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | conda: 2 | file: docs/environment.yml 3 | -------------------------------------------------------------------------------- /docs/source/img/gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrikHlobil/filesystem_spec/master/docs/source/img/gui.png -------------------------------------------------------------------------------- /docs/environment.yml: -------------------------------------------------------------------------------- 1 | name: fsspec 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.9 6 | - numpydoc 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include 
fsspec/_version.py 3 | 4 | include LICENSE 5 | include README.rst 6 | include requirements.txt 7 | -------------------------------------------------------------------------------- /ci/environment-py38.yml: -------------------------------------------------------------------------------- 1 | name: test_env 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.8 7 | - tox 8 | - tox-conda 9 | -------------------------------------------------------------------------------- /fsspec/tests/test_gui.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | panel = pytest.importorskip("panel") 4 | 5 | 6 | def test_basic(): 7 | import fsspec.gui 8 | 9 | gui = fsspec.gui.FileSelector() 10 | assert "url" in str(gui.panel) 11 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Building Documentation 2 | 3 | A basic Python environment with the packages listed in `./requirements.txt` is 4 | required to build the docs; see also ``environment.yml``. 5 | 6 | To make HTML documentation: 7 | 8 | ```bash 9 | make html 10 | ``` 11 | 12 | Outputs to `build/html/index.html` 13 | -------------------------------------------------------------------------------- /ci/environment-win.yml: -------------------------------------------------------------------------------- 1 | name: test_env 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - aiohttp 7 | - pip 8 | - requests 9 | - zstandard 10 | - python-snappy 11 | - lz4<3.1.3 12 | - pyftpdlib 13 | - cloudpickle 14 | - pytest 15 | - pytest-asyncio 16 | - pytest-benchmark 17 | - pytest-cov 18 | - pytest-mock 19 | - pytest-vcr 20 | - python-libarchive-c 21 | - numpy 22 | - nomkl 23 | - tqdm 24 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */test_*.py 4 | fsspec/_version.py 5 | fsspec/implementations/dvc.py 6 | fsspec/implementations/github.py 7 | fsspec/implementations/hdfs.py 8 | source = 9 | fsspec 10 | 11 | [report] 12 | # Regexes for lines to exclude from consideration 13 | exclude_lines = 14 | pragma: no cover 15 | 16 | raise AssertionError 17 | raise NotImplementedError 18 | pass 19 | 20 | ignore_errors = True 21 | -------------------------------------------------------------------------------- /fsspec/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | fsspec user-defined exception classes 3 | """ 4 | import asyncio 5 | 6 | 7 | class BlocksizeMismatchError(ValueError): 8 | """ 9 | Raised when a cached file is opened with a different blocksize than it was 10 | written with 11 | """ 12 | 13 | ... 14 | 15 | 16 | class FSTimeoutError(asyncio.TimeoutError): 17 | """ 18 | Raised when an fsspec function call times out 19 | """ 20 | 21 | ...
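# Usage sketch: the synchronous wrappers around fsspec's async filesystems
# convert an internal ``asyncio`` timeout into ``FSTimeoutError``, so it can be
# caught like any other exception. The "http" protocol and the URL below are
# illustrative assumptions, not part of the module above.
#
#     import fsspec
#     from fsspec.exceptions import FSTimeoutError
#
#     fs = fsspec.filesystem("http")  # an async implementation with sync wrappers
#     try:
#         data = fs.cat("https://example.com/some-file")
#     except FSTimeoutError:
#         data = None  # handle or retry the timed-out operation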
22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | target_version = ['py37', 'py38'] 3 | line-length = 88 4 | skip-string-normalization = false 5 | exclude = ''' 6 | 7 | ( 8 | /( 9 | \.eggs # exclude a few common directories in the 10 | | \.git # root of the project 11 | | \.hg 12 | | \.mypy_cache 13 | | \.tox 14 | | \.venv 15 | | _build 16 | | buck-out 17 | | build 18 | | dist 19 | )/ 20 | | fsspec/_version.py 21 | | versioneer.py 22 | ) 23 | ''' 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = fsspec 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import pytest 4 | 5 | from fsspec.implementations.local import LocalFileSystem 6 | 7 | 8 | # A dummy filesystem that has a list of protocols 9 | class MultiProtocolFileSystem(LocalFileSystem): 10 | protocol = ["file", "other"] 11 | 12 | 13 | FILESYSTEMS = {"local": LocalFileSystem, "multi": MultiProtocolFileSystem} 14 | 15 | READ_ONLY_FILESYSTEMS = [] 16 | 17 | 18 | @pytest.fixture(scope="function") 19 | def fs(request): 20 | cls = FILESYSTEMS[request.param] 21 | return cls() 22 | 23 | 24 | @pytest.fixture(scope="function") 25 | def temp_file(): 26 | with tempfile.TemporaryDirectory() as temp_dir: 27 | return temp_dir + "test-file" 28 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_dask.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import fsspec 4 | 5 | pytest.importorskip("distributed") 6 | 7 | 8 | @pytest.fixture() 9 | def cli(tmpdir): 10 | import dask.distributed 11 | 12 | client = dask.distributed.Client(n_workers=1) 13 | 14 | def setup(): 15 | m = fsspec.filesystem("memory") 16 | with m.open("afile", "wb") as f: 17 | f.write(b"data") 18 | 19 | client.run(setup) 20 | try: 21 | yield client 22 | finally: 23 | client.close() 24 | 25 | 26 | def test_basic(cli): 27 | 28 | fs = fsspec.filesystem("dask", target_protocol="memory") 29 | assert fs.ls("") == ["/afile"] 30 | assert fs.cat("/afile") == b"data" 31 | -------------------------------------------------------------------------------- /.github/workflows/pypipublish.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: 
Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: "3.x" 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools setuptools-scm wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: > 2 | (?x)^( 3 | \.tox/.* 4 | )$ 5 | repos: 6 | 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v3.4.0 9 | hooks: 10 | - id: trailing-whitespace 11 | - id: end-of-file-fixer 12 | - id: check-docstring-first 13 | - id: check-json 14 | - id: check-yaml 15 | - repo: https://github.com/ambv/black 16 | rev: 22.3.0 17 | hooks: 18 | - id: black 19 | - repo: https://gitlab.com/pycqa/flake8 20 | rev: 3.8.4 21 | hooks: 22 | - id: flake8 23 | - repo: https://github.com/asottile/seed-isort-config 24 | rev: v2.2.0 25 | hooks: 26 | - id: seed-isort-config 27 | - repo: https://github.com/pre-commit/mirrors-isort 28 | rev: v5.7.0 29 | hooks: 30 | - id: isort 31 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=fsspec 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | long_description: file: README.rst 3 | 4 | [versioneer] 5 | VCS = git 6 | style = pep440 7 | versionfile_source = fsspec/_version.py 8 | versionfile_build = fsspec/_version.py 9 | tag_prefix = "" 10 | 11 | [flake8] 12 | exclude = .tox,build,docs/source/conf.py,versioneer.py,fsspec/_version 13 | max-line-length = 88 14 | ignore = 15 | # Assigning lambda expression 16 | E731 17 | # Ambiguous variable names 18 | E741 19 | # line break before binary operator 20 | W503 21 | # whitespace before : 22 | E203 23 | # redefs 24 | F811 25 | 26 | [isort] 27 | known_first_party=fsspec 28 | known_third_party=aiohttp,dask,distributed,dvc,fuse,libarchive,numpy,panel,paramiko,pyarrow,pygit2,pytest,requests,setuptools,smbclient 29 | multi_line_output=3 30 | include_trailing_comma=True 31 | force_grid_wrap=0 32 | combine_as_imports=True 33 | line_length=88 34 | skip= 35 | .tox 36 | build 37 | docs/source/conf.py 38 | versioneer.py 39 | fsspec/_version 40 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_common.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | 4 | import pytest 5 | 6 | from fsspec import AbstractFileSystem 7 | from fsspec.implementations.tests.conftest import READ_ONLY_FILESYSTEMS 8 | 9 | 10 | @pytest.mark.parametrize("fs", ["local"], indirect=["fs"]) 11 | def test_created(fs: AbstractFileSystem, temp_file): 12 | try: 13 | fs.touch(temp_file) 14 | created = fs.created(path=temp_file) 15 | assert isinstance(created, datetime.datetime) 16 | finally: 17 | if not isinstance(fs, tuple(READ_ONLY_FILESYSTEMS)): 18 | fs.rm(temp_file) 19 | 20 | 21 | @pytest.mark.parametrize("fs", ["local"], indirect=["fs"]) 22 | def test_modified(fs: AbstractFileSystem, temp_file): 23 | try: 24 | fs.touch(temp_file) 25 | created = fs.created(path=temp_file) 26 | time.sleep(0.05) 27 | fs.touch(temp_file) 28 | modified = fs.modified(path=temp_file) 29 | assert isinstance(modified, datetime.datetime) 30 | assert modified > created 31 | finally: 32 | fs.rm(temp_file) 33 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_libarchive.py: -------------------------------------------------------------------------------- 1 | # this test case checks that the libarchive can be used from a seekable source (any fs 2 | # with a block cache active) 3 | import fsspec 4 | from fsspec.implementations.tests.test_archive import archive_data, temparchive 5 | 6 | 7 | def test_cache(ftp_writable): 8 | host, port, username, password = "localhost", 2121, "user", "pass" 9 | 10 | with temparchive(archive_data) as archive_file: 11 | with fsspec.open( 12 | "ftp:///archive.7z", 13 | "wb", 14 | host=host, 15 | port=port, 16 | username=username, 17 | password=password, 18 | ) as f: 19 | f.write(open(archive_file, "rb").read()) 20 | of = fsspec.open( 21 | "libarchive://deeply/nested/path::ftp:///archive.7z", 22 | ftp={ 23 | "host": host, 24 | "port": port, 25 | 
"username": username, 26 | "password": password, 27 | }, 28 | ) 29 | 30 | with of as f: 31 | readdata = f.read() 32 | 33 | assert readdata == archive_data["deeply/nested/path"] 34 | -------------------------------------------------------------------------------- /fsspec/tests/test_generic.py: -------------------------------------------------------------------------------- 1 | import fsspec 2 | from fsspec.tests.conftest import data, server # noqa: F401 3 | 4 | 5 | def test_remote_async_ops(server): 6 | fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) 7 | fs = fsspec.filesystem("generic", default_method="current") 8 | out = fs.info(server + "/index/realfile") 9 | assert out["size"] == len(data) 10 | assert out["type"] == "file" 11 | assert fs.isfile(server + "/index/realfile") # this method from superclass 12 | 13 | 14 | def test_touch_rm(m): 15 | m.touch("afile") 16 | m.touch("dir/afile") 17 | 18 | fs = fsspec.filesystem("generic", default_method="current") 19 | fs.rm("memory://afile") 20 | assert not m.exists("afile") 21 | 22 | fs.rm("memory://dir", recursive=True) 23 | assert not m.exists("dir/afile") 24 | assert not m.exists("dir") 25 | 26 | 27 | def test_cp_async_to_sync(server, m): 28 | fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) 29 | fs = fsspec.filesystem("generic", default_method="current") 30 | fs.cp(server + "/index/realfile", "memory://realfile") 31 | assert m.cat("realfile") == data 32 | 33 | fs.rm("memory://realfile") 34 | assert not m.exists("realfile") 35 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_zip.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | 3 | import fsspec 4 | from fsspec.implementations.tests.test_archive import archive_data, tempzip 5 | 6 | 7 | def test_info(): 8 | with tempzip(archive_data) as z: 9 | fs = fsspec.filesystem("zip", fo=z) 10 | 11 | # Iterate over all files. 12 | for f, v in archive_data.items(): 13 | lhs = fs.info(f) 14 | 15 | # Probe some specific fields of Zip archives. 16 | assert "CRC" in lhs 17 | assert "compress_size" in lhs 18 | 19 | 20 | def test_fsspec_get_mapper(): 21 | """Added for #788""" 22 | 23 | with tempzip(archive_data) as z: 24 | mapping = fsspec.get_mapper(f"zip::{z}") 25 | 26 | assert isinstance(mapping, collections.abc.Mapping) 27 | keys = sorted(list(mapping.keys())) 28 | assert keys == ["a", "b", "deeply/nested/path"] 29 | 30 | # mapping.getitems() will call FSMap.fs.cat() 31 | # which was not accurately implemented for zip. 32 | assert isinstance(mapping, fsspec.mapping.FSMap) 33 | items = dict(mapping.getitems(keys)) 34 | assert items == {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"} 35 | 36 | 37 | def test_not_cached(): 38 | with tempzip(archive_data) as z: 39 | fs = fsspec.filesystem("zip", fo=z) 40 | fs2 = fsspec.filesystem("zip", fo=z) 41 | assert fs is not fs2 42 | -------------------------------------------------------------------------------- /fsspec/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import sys 5 | import time 6 | 7 | import pytest 8 | 9 | import fsspec 10 | from fsspec.implementations.cached import CachingFileSystem 11 | 12 | 13 | @pytest.fixture() 14 | def m(): 15 | """ 16 | Fixture providing a memory filesystem. 
17 | """ 18 | m = fsspec.filesystem("memory") 19 | m.store.clear() 20 | m.pseudo_dirs.clear() 21 | m.pseudo_dirs.append("") 22 | try: 23 | yield m 24 | finally: 25 | m.store.clear() 26 | m.pseudo_dirs.clear() 27 | m.pseudo_dirs.append("") 28 | 29 | 30 | @pytest.fixture 31 | def ftp_writable(tmpdir): 32 | """ 33 | Fixture providing a writable FTP filesystem. 34 | """ 35 | pytest.importorskip("pyftpdlib") 36 | from fsspec.implementations.ftp import FTPFileSystem 37 | 38 | FTPFileSystem.clear_instance_cache() # remove lingering connections 39 | CachingFileSystem.clear_instance_cache() 40 | d = str(tmpdir) 41 | with open(os.path.join(d, "out"), "wb") as f: 42 | f.write(b"hello" * 10000) 43 | P = subprocess.Popen( 44 | [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"] 45 | ) 46 | try: 47 | time.sleep(1) 48 | yield "localhost", 2121, "user", "pass" 49 | finally: 50 | P.terminate() 51 | P.wait() 52 | try: 53 | shutil.rmtree(tmpdir) 54 | except Exception: 55 | pass 56 | -------------------------------------------------------------------------------- /fsspec/tests/data/listing.html: -------------------------------------------------------------------------------- 1 | \nnasagrace.unl.edu - /data/20020401/

nasagrace.unl.edu - /data/20020401/


\n\n
[To Parent Directory]

1/27/2020 9:54 AM 1194073 GRACE_GWS_20020401.pdf
1/27/2020 9:54 AM 380043 GRACE_GWS_20020401.png
1/27/2020 9:54 AM 1192987 GRACE_RTZSM_20020401.pdf
1/27/2020 9:54 AM 384342 GRACE_RTZSM_20020401.png
1/27/2020 9:55 AM 1202046 GRACE_SFSM_20020401.pdf
1/27/2020 9:55 AM 387932 GRACE_SFSM_20020401.png
1/27/2020 9:54 AM 4975980 GRACEDADM_CLSM0125US_7D.A20020401.030.nc4
1/27/2020 9:54 AM 345640 gws_perc_0125deg_US_20020401.img
1/27/2020 9:54 AM 2272 gws_perc_0125deg_US_20020401.img.aux.xml
1/27/2020 9:54 AM 5678 gws_perc_0125deg_US_20020401.img.xml
1/27/2020 9:54 AM 136081 gws_perc_0125deg_US_20020401.rrd
1/27/2020 9:54 AM 83 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Martin Durant 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_jupyter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import subprocess 4 | import time 5 | 6 | import pytest 7 | 8 | import fsspec 9 | 10 | pytest.importorskip("notebook") 11 | requests = pytest.importorskip("requests") 12 | 13 | 14 | @pytest.fixture() 15 | def jupyter(tmpdir): 16 | 17 | tmpdir = str(tmpdir) 18 | os.environ["JUPYTER_TOKEN"] = "blah" 19 | try: 20 | cmd = f"jupyter notebook --notebook-dir={tmpdir} --no-browser --port=5566" 21 | P = subprocess.Popen(shlex.split(cmd)) 22 | except FileNotFoundError: 23 | pytest.skip("notebook not installed correctly") 24 | try: 25 | timeout = 15 26 | while True: 27 | try: 28 | r = requests.get("http://localhost:5566/?token=blah") 29 | r.raise_for_status() 30 | break 31 | except (requests.exceptions.BaseHTTPError, IOError): 32 | time.sleep(0.1) 33 | timeout -= 0.1 34 | if timeout < 0: 35 | pytest.xfail("Timed out for jupyter") 36 | yield "http://localhost:5566/?token=blah", tmpdir 37 | finally: 38 | P.terminate() 39 | 40 | 41 | def test_simple(jupyter): 42 | url, d = jupyter 43 | fs = fsspec.filesystem("jupyter", url=url) 44 | assert fs.ls("") == [] 45 | 46 | fs.pipe("afile", b"data") 47 | assert fs.cat("afile") == b"data" 48 | assert "afile" in os.listdir(d) 49 | 50 | with fs.open("bfile", "wb") as f: 51 | f.write(b"more") 52 | with fs.open("bfile", "rb") as f: 53 | assert f.read() == b"more" 54 | 55 | assert fs.info("bfile")["size"] == 4 56 | fs.rm("afile") 57 | 58 | assert "afile" not in os.listdir(d) 59 | -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: "*" 6 | pull_request: 7 | branches: master 8 | 9 | jobs: 10 | linux: 11 | name: ${{ matrix.TOXENV }}-pytest 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | TOXENV: [py37, py38, py39, s3fs, gcsfs] 17 | 18 | env: 19 | TOXENV: ${{ matrix.TOXENV }} 20 | CIRUN: true 21 | 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v2 25 | 26 | - name: Setup Miniconda 27 | uses: conda-incubator/setup-miniconda@v2 28 | with: 29 | auto-update-conda: true 30 | auto-activate-base: false 31 | activate-environment: test_env 32 | environment-file: ci/environment-py38.yml 33 | 34 | - name: Run Tests 35 | shell: bash -l {0} 36 | run: | 37 | tox -v 38 | 39 | win: 40 | name: ${{ matrix.TOXENV }}-pytest-win 41 | runs-on: windows-2019 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | TOXENV: [py38] 46 | 47 | env: 48 | TOXENV: ${{ matrix.TOXENV }} 49 | CIRUN: true 50 | 51 | steps: 52 | - name: Checkout 53 | uses: actions/checkout@v2 54 | 55 | - name: Setup Miniconda 56 | uses: conda-incubator/setup-miniconda@v2 57 | with: 58 | auto-update-conda: true 59 | auto-activate-base: false 60 | activate-environment: test_env 61 | environment-file: ci/environment-win.yml 62 | 63 | - name: Run Tests 64 | shell: bash -l {0} 65 | run: | 66 | pytest -v 67 | 68 | lint: 69 | name: lint 70 | runs-on: ubuntu-latest 71 | steps: 72 | - uses: actions/checkout@v2 73 | - uses: actions/setup-python@v2 74 | - uses: pre-commit/action@v2.0.0 75 | -------------------------------------------------------------------------------- /fsspec/tests/test_callbacks.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from fsspec.callbacks import Callback, TqdmCallback 4 | 5 | 6 | def test_callbacks(): 7 | empty_callback = Callback() 8 | assert empty_callback.call("something", somearg=None) is None 9 | 10 | hooks = dict(something=lambda *_, arg=None: arg + 2) 11 | simple_callback = Callback(hooks=hooks) 12 | assert simple_callback.call("something", arg=2) == 4 13 | 14 | hooks = dict(something=lambda *_, arg1=None, arg2=None: arg1 + arg2) 15 | multi_arg_callback = Callback(hooks=hooks) 16 | assert multi_arg_callback.call("something", arg1=2, arg2=2) == 4 17 | 18 | 19 | def test_callbacks_as_callback(): 20 | empty_callback = Callback.as_callback(None) 21 | assert empty_callback.call("something", arg="somearg") is None 22 | assert Callback.as_callback(None) is Callback.as_callback(None) 23 | 24 | hooks = dict(something=lambda *_, arg=None: arg + 2) 25 | real_callback = Callback.as_callback(Callback(hooks=hooks)) 26 | assert real_callback.call("something", arg=2) == 4 27 | 28 | 29 | def test_callbacks_wrap(): 30 | events = [] 31 | 32 | class TestCallback(Callback): 33 | def relative_update(self, inc=1): 34 | events.append(inc) 35 | 36 | callback = TestCallback() 37 | for _ in callback.wrap(range(10)): 38 | ... 39 | 40 | assert events == [1] * 10 41 | 42 | 43 | @pytest.mark.parametrize("tqdm_kwargs", [{}, {"desc": "A custom desc"}]) 44 | def test_tqdm_callback(tqdm_kwargs, mocker): 45 | 46 | callback = TqdmCallback(tqdm_kwargs=tqdm_kwargs) 47 | mocker.patch.object(callback, "_tqdm") 48 | callback.set_size(10) 49 | for _ in callback.wrap(range(10)): 50 | ... 51 | 52 | assert callback.tqdm.update.call_count == 10 53 | if not tqdm_kwargs: 54 | callback._tqdm.tqdm.assert_called_with(total=10) 55 | else: 56 | callback._tqdm.tqdm.assert_called_with(total=10, **tqdm_kwargs) 57 | -------------------------------------------------------------------------------- /fsspec/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from importlib.metadata import entry_points 3 | except ImportError: # python < 3.8 4 | try: 5 | from importlib_metadata import entry_points 6 | except ImportError: 7 | entry_points = None 8 | 9 | 10 | from . 
import _version, caching 11 | from .callbacks import Callback 12 | from .compression import available_compressions 13 | from .core import get_fs_token_paths, open, open_files, open_local 14 | from .exceptions import FSTimeoutError 15 | from .mapping import FSMap, get_mapper 16 | from .registry import ( 17 | available_protocols, 18 | filesystem, 19 | get_filesystem_class, 20 | register_implementation, 21 | registry, 22 | ) 23 | from .spec import AbstractFileSystem 24 | 25 | __version__ = _version.get_versions()["version"] 26 | 27 | __all__ = [ 28 | "AbstractFileSystem", 29 | "FSTimeoutError", 30 | "FSMap", 31 | "filesystem", 32 | "register_implementation", 33 | "get_filesystem_class", 34 | "get_fs_token_paths", 35 | "get_mapper", 36 | "open", 37 | "open_files", 38 | "open_local", 39 | "registry", 40 | "caching", 41 | "Callback", 42 | "available_protocols", 43 | "available_compressions", 44 | ] 45 | 46 | 47 | def process_entries(): 48 | if entry_points is not None: 49 | try: 50 | eps = entry_points() 51 | except TypeError: 52 | pass # importlib-metadata < 0.8 53 | else: 54 | if hasattr(eps, "select"): # Python 3.10+ / importlib_metadata >= 3.9.0 55 | specs = eps.select(group="fsspec.specs") 56 | else: 57 | specs = eps.get("fsspec.specs", []) 58 | for spec in specs: 59 | err_msg = f"Unable to load filesystem from {spec}" 60 | register_implementation( 61 | spec.name, spec.value.replace(":", "."), errtxt=err_msg 62 | ) 63 | 64 | 65 | process_entries() 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dask 2 | dask-worker-space 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | pip-wheel-metadata/ 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # jetbrains ide stuff 109 | *.iml 110 | .idea/ 111 | 112 | # vscode ide stuff 113 | *.code-workspace 114 | .history 115 | .vscode 116 | 117 | # docker artifacts 118 | .docker 119 | 120 | # vi* 121 | *.swp 122 | 123 | build/ 124 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | from setuptools import setup 5 | 6 | import versioneer 7 | 8 | here = os.path.abspath(os.path.dirname(__file__)) 9 | with open(os.path.join(here, "README.md"), encoding="utf-8") as f: 10 | long_description = f.read() 11 | 12 | setup( 13 | name="fsspec", 14 | version=versioneer.get_version(), 15 | cmdclass=versioneer.get_cmdclass(), 16 | classifiers=[ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Developers", 19 | "License :: OSI Approved :: BSD License", 20 | "Operating System :: OS Independent", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | ], 26 | description="File-system specification", 27 | long_description=long_description, 28 | long_description_content_type="text/markdown", 29 | url="http://github.com/fsspec/filesystem_spec", 30 | maintainer="Martin Durant", 31 | maintainer_email="mdurant@anaconda.com", 32 | license="BSD", 33 | keywords="file", 34 | packages=["fsspec", "fsspec.implementations"], 35 | python_requires=">=3.7", 36 | install_requires=open("requirements.txt").read().strip().split("\n"), 37 | extras_require={ 38 | "entrypoints": ["importlib_metadata ; python_version < '3.8' "], 39 | "abfs": ["adlfs"], 40 | "adl": ["adlfs"], 41 | "dask": ["dask", "distributed"], 42 | "dropbox": ["dropboxdrivefs", "requests", "dropbox"], 43 | "gcs": ["gcsfs"], 44 | "git": ["pygit2"], 45 | "github": ["requests"], 46 | "gs": ["gcsfs"], 47 | "hdfs": ["pyarrow >= 1"], 48 | "arrow": ["pyarrow >= 1"], 49 | "http": ["requests", "aiohttp"], 50 | "sftp": ["paramiko"], 51 | "s3": ["s3fs"], 52 | "oci": ["ocifs"], 53 | "smb": ["smbprotocol"], 54 | "ssh": ["paramiko"], 55 | "fuse": ["fusepy"], 56 | "libarchive": ["libarchive-c"], 57 | "gui": ["panel"], 58 | "tqdm": ["tqdm"], 59 | }, 60 | zip_safe=False, 61 | ) 62 | -------------------------------------------------------------------------------- 
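The ``extras_require`` mapping in ``setup.py`` above declares which optional packages back each protocol; as a hedged sketch of pulling them in (exact versions resolved depend on your package index):

```bash
# install fsspec together with the HTTP extra declared above
pip install "fsspec[http]"    # pulls in requests and aiohttp
# extras can be combined
pip install "fsspec[s3,gui]"  # pulls in s3fs and panel
```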
/fsspec/tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | import fsspec 6 | from fsspec.config import conf, set_conf_env, set_conf_files 7 | 8 | 9 | @pytest.fixture 10 | def clean_conf(): 11 | """Tests should start and end with clean config dict""" 12 | conf.clear() 13 | yield 14 | conf.clear() 15 | 16 | 17 | def test_from_env(clean_conf): 18 | env = { 19 | "FSSPEC_PROTO_KEY": "value", 20 | "FSSPEC_PROTO_LONG_KEY": "othervalue", 21 | "FSSPEC_MALFORMED": "novalue", 22 | } 23 | cd = {} 24 | set_conf_env(conf_dict=cd, envdict=env) 25 | assert cd == {"proto": {"key": "value", "long_key": "othervalue"}} 26 | 27 | 28 | def test_from_file_ini(clean_conf, tmpdir): 29 | file1 = os.path.join(tmpdir, "1.ini") 30 | file2 = os.path.join(tmpdir, "2.ini") 31 | with open(file1, "w") as f: 32 | f.write( 33 | """[proto] 34 | key=value 35 | other_key:othervalue 36 | overwritten=dont_see 37 | """ 38 | ) 39 | with open(file2, "w") as f: 40 | f.write( 41 | """[proto] 42 | overwritten=see 43 | """ 44 | ) 45 | cd = {} 46 | set_conf_files(tmpdir, cd) 47 | assert cd == { 48 | "proto": {"key": "value", "other_key": "othervalue", "overwritten": "see"} 49 | } 50 | 51 | 52 | def test_from_file_json(clean_conf, tmpdir): 53 | file1 = os.path.join(tmpdir, "1.json") 54 | file2 = os.path.join(tmpdir, "2.json") 55 | with open(file1, "w") as f: 56 | f.write( 57 | """{"proto": 58 | {"key": "value", 59 | "other_key": "othervalue", 60 | "overwritten": false}} 61 | """ 62 | ) 63 | with open(file2, "w") as f: 64 | f.write( 65 | """{"proto": 66 | {"overwritten": true}} 67 | """ 68 | ) 69 | cd = {} 70 | set_conf_files(tmpdir, cd) 71 | assert cd == { 72 | "proto": {"key": "value", "other_key": "othervalue", "overwritten": True} 73 | } 74 | 75 | 76 | def test_apply(clean_conf): 77 | conf["file"] = {"auto_mkdir": "test"} 78 | fs = fsspec.filesystem("file") 79 | assert fs.auto_mkdir == "test" 80 | fs = fsspec.filesystem("file", auto_mkdir=True) 81 | assert fs.auto_mkdir is True 82 | -------------------------------------------------------------------------------- /fsspec/archive.py: -------------------------------------------------------------------------------- 1 | from fsspec import AbstractFileSystem 2 | from fsspec.utils import tokenize 3 | 4 | 5 | class AbstractArchiveFileSystem(AbstractFileSystem): 6 | """ 7 | A generic superclass for implementing Archive-based filesystems. 8 | 9 | Currently, it is shared amongst `ZipFileSystem`, `LibArchiveFileSystem` and 10 | `TarFileSystem`. 11 | """ 12 | 13 | def __str__(self): 14 | return "<Archive-like object %s at %s>" % (type(self).__name__, id(self)) 15 | 16 | __repr__ = __str__ 17 | 18 | def ukey(self, path): 19 | return tokenize(path, self.fo, self.protocol) 20 | 21 | def _all_dirnames(self, paths): 22 | """Returns *all* directory names for each path in paths, including intermediate ones.
23 | 24 | Parameters 25 | ---------- 26 | paths: Iterable of path strings 27 | """ 28 | if len(paths) == 0: 29 | return set() 30 | 31 | dirnames = {self._parent(path) for path in paths} - {self.root_marker} 32 | return dirnames | self._all_dirnames(dirnames) 33 | 34 | def info(self, path, **kwargs): 35 | self._get_dirs() 36 | path = self._strip_protocol(path) 37 | if path in self.dir_cache: 38 | return self.dir_cache[path] 39 | elif path + "/" in self.dir_cache: 40 | return self.dir_cache[path + "/"] 41 | else: 42 | raise FileNotFoundError(path) 43 | 44 | def ls(self, path, detail=False, **kwargs): 45 | self._get_dirs() 46 | paths = {} 47 | for p, f in self.dir_cache.items(): 48 | p = p.rstrip("/") 49 | if "/" in p: 50 | root = p.rsplit("/", 1)[0] 51 | else: 52 | root = "" 53 | if root == path.rstrip("/"): 54 | paths[p] = f 55 | elif all( 56 | (a == b) 57 | for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) 58 | ): 59 | # root directory entry 60 | ppath = p.rstrip("/").split("/", 1)[0] 61 | if ppath not in paths: 62 | out = {"name": ppath + "/", "size": 0, "type": "directory"} 63 | paths[ppath] = out 64 | out = list(paths.values()) 65 | if detail: 66 | return out 67 | else: 68 | return list(sorted(f["name"] for f in out)) 69 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # content of: tox.ini , put in same dir as setup.py 2 | [tox] 3 | envlist = {py37,py38,py39} 4 | 5 | [core] 6 | conda_channels= 7 | conda-forge 8 | defaults 9 | conda_deps= 10 | pip 11 | paramiko 12 | requests 13 | zstandard 14 | python-snappy 15 | aiohttp 16 | lz4 17 | distributed 18 | dask 19 | 'pyarrow >= 1' 20 | panel 21 | notebook 22 | pygit2 23 | git 24 | s3fs 25 | pyftpdlib 26 | cloudpickle 27 | pytest 28 | pytest-asyncio 29 | pytest-benchmark 30 | pytest-cov 31 | pytest-mock 32 | pytest-vcr 33 | fusepy 34 | tomli < 2 35 | msgpack-python 36 | python-libarchive-c 37 | numpy 38 | nomkl 39 | jinja2 40 | tqdm 41 | deps= 42 | hadoop-test-cluster==0.1.0 43 | smbprotocol 44 | py37: importlib_metadata 45 | 46 | [testenv] 47 | description=Run test suite against target versions. 48 | conda_channels= 49 | {[core]conda_channels} 50 | conda_deps= 51 | {[core]conda_deps} 52 | deps= 53 | {[core]deps} 54 | commands = 55 | pytest --cov=fsspec -v -r s {posargs} 56 | passenv = CIRUN 57 | 58 | [testenv:s3fs] 59 | description=Run s3fs (@master) test suite against fsspec. 60 | extras=s3 61 | conda_channels= 62 | defaults 63 | conda-forge 64 | conda_deps= 65 | {[core]conda_deps} 66 | httpretty 67 | aiobotocore 68 | "moto<3.0" 69 | flask 70 | changedir=.tox/s3fs/tmp 71 | whitelist_externals= 72 | rm 73 | git 74 | setenv= 75 | BOTO_CONFIG=/dev/null 76 | AWS_ACCESS_KEY_ID=foobar_key 77 | AWS_SECRET_ACCESS_KEY=foobar_secret 78 | commands= 79 | rm -rf s3fs 80 | git clone https://github.com/fsspec/s3fs 81 | pytest -vv s3fs/s3fs 82 | 83 | [testenv:gcsfs] 84 | description=Run gcsfs (@master) test suite against fsspec. 
85 | extras=gcs 86 | conda_channels= 87 | defaults 88 | conda-forge 89 | conda_deps= 90 | {[core]conda_deps} 91 | deps= 92 | {[core]deps} 93 | vcrpy 94 | ujson 95 | google-auth-oauthlib 96 | crcmod 97 | changedir=.tox/gcsfs/tmp 98 | whitelist_externals= 99 | rm 100 | git 101 | setenv= 102 | GCSFS_RECORD_MODE=none 103 | GOOGLE_APPLICATION_CREDENTIALS=gcsfs/gcsfs/tests/fake-secret.json 104 | commands= 105 | rm -rf gcsfs 106 | git clone https://github.com/fsspec/gcsfs 107 | pytest -vv gcsfs/gcsfs -k 'not fuse' 108 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/cassettes/test_dbfs_file_listing.yaml: -------------------------------------------------------------------------------- 1 | interactions: 2 | - request: 3 | body: '{"path": "/"}' 4 | headers: 5 | Accept: 6 | - '*/*' 7 | Accept-Encoding: 8 | - gzip, deflate 9 | Connection: 10 | - keep-alive 11 | Content-Length: 12 | - '13' 13 | Content-Type: 14 | - application/json 15 | User-Agent: 16 | - python-requests/2.25.1 17 | authorization: 18 | - DUMMY 19 | method: GET 20 | uri: https://my_instance.com/api/2.0/dbfs/list 21 | response: 22 | body: 23 | string: !!binary | 24 | H4sIAAAAAAAEA4zLMQ5AQBBG4bv89Qr1HsAFlCKyGDFByMxsQ9ydaHSy7cv3Toy8kMLXJ/ZgEzzy 25 | 8imVbUJwYG0HFniTSO61rfJB8MXlvmMIFjrhftZMSONimrwaqaXjdU+2UUn+cXPdAAAA//8DAHlY 26 | NJf+AAAA 27 | headers: 28 | content-encoding: 29 | - gzip 30 | content-type: 31 | - application/json 32 | server: 33 | - databricks 34 | strict-transport-security: 35 | - max-age=31536000; includeSubDomains; preload 36 | transfer-encoding: 37 | - chunked 38 | vary: 39 | - Accept-Encoding 40 | x-content-type-options: 41 | - nosniff 42 | status: 43 | code: 200 44 | message: OK 45 | - request: 46 | body: '{"path": "/"}' 47 | headers: 48 | Accept: 49 | - '*/*' 50 | Accept-Encoding: 51 | - gzip, deflate 52 | Connection: 53 | - keep-alive 54 | Content-Length: 55 | - '13' 56 | Content-Type: 57 | - application/json 58 | User-Agent: 59 | - python-requests/2.25.1 60 | authorization: 61 | - DUMMY 62 | method: GET 63 | uri: https://my_instance.com/api/2.0/dbfs/list 64 | response: 65 | body: 66 | string: !!binary | 67 | H4sIAAAAAAAEA4zLMQ5AQBBG4bv89Qr1HsAFlCKyGDFByMxsQ9ydaHSy7cv3Toy8kMLXJ/ZgEzzy 68 | 8imVbUJwYG0HFniTSO61rfJB8MXlvmMIFjrhftZMSONimrwaqaXjdU+2UUn+cXPdAAAA//8DAHlY 69 | NJf+AAAA 70 | headers: 71 | content-encoding: 72 | - gzip 73 | content-type: 74 | - application/json 75 | server: 76 | - databricks 77 | strict-transport-security: 78 | - max-age=31536000; includeSubDomains; preload 79 | transfer-encoding: 80 | - chunked 81 | vary: 82 | - Accept-Encoding 83 | x-content-type-options: 84 | - nosniff 85 | status: 86 | code: 200 87 | message: OK 88 | version: 1 89 | -------------------------------------------------------------------------------- /fsspec/implementations/dvc.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import dvc.repo 4 | 5 | from fsspec.implementations.local import LocalFileSystem 6 | from fsspec.spec import AbstractFileSystem 7 | 8 | lfs = LocalFileSystem() 9 | 10 | 11 | class DVCFileSystem(AbstractFileSystem): 12 | """DVC backend (experimental) 13 | 14 | Load data files that are versioned using the `Data Version Control`_ system 15 | 16 | .. _Data Version Control: https://dvc.org/ 17 | 18 | This interface is incomplete and experimental. 
19 | """ 20 | 21 | root_marker = "" 22 | 23 | def __init__(self, path=None, **kwargs): 24 | """ 25 | 26 | Parameters 27 | ---------- 28 | path: str (optional) 29 | Location of the repo to access; defaults to the current directory. 30 | """ 31 | super().__init__(**kwargs) 32 | self.repo = dvc.repo.Repo(path) 33 | self.path = self.repo.find_root() 34 | 35 | @classmethod 36 | def _strip_protocol(cls, path): 37 | return super()._strip_protocol(path).lstrip("/") 38 | 39 | def ls(self, path, detail=False, **kwargs): 40 | path = self._strip_protocol(path) 41 | allfiles = self.repo.tree.walk(os.path.join(self.repo.root_dir, path)) 42 | dirname, dirs, files = next(allfiles) 43 | out = [os.path.join(path, f) for f in dirs + files] 44 | details = [] 45 | 46 | for f in out: 47 | full = os.path.join(self.repo.root_dir, f) 48 | file_info = lfs.info(full) 49 | if lfs.isdir(full): 50 | details.append(file_info) 51 | else: 52 | try: 53 | extra = self.repo.find_out_by_relpath(full).dumpd() 54 | except dvc.exceptions.OutputNotFoundError: 55 | continue 56 | details.append(dict(**extra, **file_info)) 57 | details[-1]["name"] = f 58 | if detail: 59 | return details 60 | return [d["name"] for d in details] 61 | 62 | def ukey(self, path): 63 | return self.info(path)["md5"] 64 | 65 | def _open( 66 | self, 67 | path, 68 | mode="rb", 69 | block_size=None, 70 | autocommit=True, 71 | cache_options=None, 72 | **kwargs, 73 | ): 74 | # returns a context file object (i.e., needs to be used with ``with`` 75 | path = self._strip_protocol(path) 76 | return self.repo.open_by_relpath(path) 77 | -------------------------------------------------------------------------------- /fsspec/transaction.py: -------------------------------------------------------------------------------- 1 | class Transaction(object): 2 | """Filesystem transaction write context 3 | 4 | Gathers files for deferred commit or discard, so that several write 5 | operations can be finalized semi-atomically. 
This works by having this 6 | instance as the ``.transaction`` attribute of the given filesystem 7 | """ 8 | 9 | def __init__(self, fs): 10 | """ 11 | Parameters 12 | ---------- 13 | fs: FileSystem instance 14 | """ 15 | self.fs = fs 16 | self.files = [] 17 | 18 | def __enter__(self): 19 | self.start() 20 | 21 | def __exit__(self, exc_type, exc_val, exc_tb): 22 | """End transaction and commit, if exit is not due to exception""" 23 | # only commit if there was no exception 24 | self.complete(commit=exc_type is None) 25 | self.fs._intrans = False 26 | self.fs._transaction = None 27 | 28 | def start(self): 29 | """Start a transaction on this FileSystem""" 30 | self.files = [] # clean up after previous failed completions 31 | self.fs._intrans = True 32 | 33 | def complete(self, commit=True): 34 | """Finish transaction: commit or discard all deferred files""" 35 | for f in self.files: 36 | if commit: 37 | f.commit() 38 | else: 39 | f.discard() 40 | self.files = [] 41 | self.fs._intrans = False 42 | 43 | 44 | class FileActor(object): 45 | def __init__(self): 46 | self.files = [] 47 | 48 | def commit(self): 49 | for f in self.files: 50 | f.commit() 51 | self.files.clear() 52 | 53 | def discard(self): 54 | for f in self.files: 55 | f.discard() 56 | self.files.clear() 57 | 58 | def append(self, f): 59 | self.files.append(f) 60 | 61 | 62 | class DaskTransaction(Transaction): 63 | def __init__(self, fs): 64 | """ 65 | Parameters 66 | ---------- 67 | fs: FileSystem instance 68 | """ 69 | import distributed 70 | 71 | super().__init__(fs) 72 | client = distributed.default_client() 73 | self.files = client.submit(FileActor, actor=True).result() 74 | 75 | def complete(self, commit=True): 76 | """Finish transaction: commit or discard all deferred files""" 77 | if commit: 78 | self.files.commit().result() 79 | else: 80 | self.files.discard().result() 81 | self.fs._intrans = False 82 | -------------------------------------------------------------------------------- /fsspec/dircache.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections.abc import MutableMapping 3 | from functools import lru_cache 4 | 5 | 6 | class DirCache(MutableMapping): 7 | """ 8 | Caching of directory listings, in a structure like:: 9 | 10 | {"path0": [ 11 | {"name": "path0/file0", 12 | "size": 123, 13 | "type": "file", 14 | ... 15 | }, 16 | {"name": "path0/file1", 17 | }, 18 | ... 19 | ], 20 | "path1": [...] 21 | } 22 | 23 | Parameters to this class control listing expiry or indeed turn 24 | caching off 25 | """ 26 | 27 | def __init__( 28 | self, 29 | use_listings_cache=True, 30 | listings_expiry_time=None, 31 | max_paths=None, 32 | **kwargs, 33 | ): 34 | """ 35 | 36 | Parameters 37 | ---------- 38 | use_listings_cache: bool 39 | If False, this cache never returns items, but always reports KeyError, 40 | and setting items has no effect 41 | listings_expiry_time: int or float (optional) 42 | Time in seconds that a listing is considered valid. If None, 43 | listings do not expire. 44 | max_paths: int (optional) 45 | The number of most recent listings that are considered valid; 'recent' 46 | refers to when the entry was set. 
47 | """ 48 | self._cache = {} 49 | self._times = {} 50 | if max_paths: 51 | self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None)) 52 | self.use_listings_cache = use_listings_cache 53 | self.listings_expiry_time = listings_expiry_time 54 | self.max_paths = max_paths 55 | 56 | def __getitem__(self, item): 57 | if self.listings_expiry_time is not None: 58 | if self._times.get(item, 0) - time.time() < -self.listings_expiry_time: 59 | del self._cache[item] 60 | if self.max_paths: 61 | self._q(item) 62 | return self._cache[item] # maybe raises KeyError 63 | 64 | def clear(self): 65 | self._cache.clear() 66 | 67 | def __len__(self): 68 | return len(self._cache) 69 | 70 | def __contains__(self, item): 71 | try: 72 | self[item] 73 | return True 74 | except KeyError: 75 | return False 76 | 77 | def __setitem__(self, key, value): 78 | if not self.use_listings_cache: 79 | return 80 | if self.max_paths: 81 | self._q(key) 82 | self._cache[key] = value 83 | if self.listings_expiry_time is not None: 84 | self._times[key] = time.time() 85 | 86 | def __delitem__(self, key): 87 | del self._cache[key] 88 | 89 | def __iter__(self): 90 | return (k for k in self._cache if k in self) 91 | 92 | def __reduce__(self): 93 | return ( 94 | DirCache, 95 | (self.use_listings_cache, self.listings_expiry_time, self.max_paths), 96 | ) 97 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_git.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import tempfile 5 | 6 | import pytest 7 | 8 | import fsspec 9 | from fsspec.implementations.local import make_path_posix 10 | 11 | pygit2 = pytest.importorskip("pygit2") 12 | 13 | 14 | @pytest.fixture() 15 | def repo(): 16 | orig_dir = os.getcwd() 17 | d = tempfile.mkdtemp() 18 | try: 19 | os.chdir(d) 20 | subprocess.call("git init -b master", shell=True, cwd=d) 21 | subprocess.call("git init -b master", shell=True, cwd=d) 22 | subprocess.call('git config user.email "you@example.com"', shell=True, cwd=d) 23 | subprocess.call('git config user.name "Your Name"', shell=True, cwd=d) 24 | open(os.path.join(d, "file1"), "wb").write(b"data0") 25 | subprocess.call("git add file1", shell=True, cwd=d) 26 | subprocess.call('git commit -m "init"', shell=True, cwd=d) 27 | sha = open(os.path.join(d, ".git/refs/heads/master"), "r").read().strip() 28 | open(os.path.join(d, "file1"), "wb").write(b"data00") 29 | subprocess.check_output('git commit -a -m "tagger"', shell=True, cwd=d) 30 | subprocess.call('git tag -a thetag -m "make tag"', shell=True, cwd=d) 31 | open(os.path.join(d, "file2"), "wb").write(b"data000") 32 | subprocess.call("git add file2", shell=True) 33 | subprocess.call('git commit -m "master tip"', shell=True, cwd=d) 34 | subprocess.call("git checkout -b abranch", shell=True, cwd=d) 35 | os.mkdir("inner") 36 | open(os.path.join(d, "inner", "file1"), "wb").write(b"data3") 37 | subprocess.call("git add inner/file1", shell=True, cwd=d) 38 | subprocess.call('git commit -m "branch tip"', shell=True, cwd=d) 39 | os.chdir(orig_dir) 40 | yield d, sha 41 | finally: 42 | os.chdir(orig_dir) 43 | shutil.rmtree(d) 44 | 45 | 46 | def test_refs(repo): 47 | d, sha = repo 48 | with fsspec.open("git://file1", path=d, ref=sha) as f: 49 | assert f.read() == b"data0" 50 | 51 | with fsspec.open("git://file1", path=d, ref="thetag") as f: 52 | assert f.read() == b"data00" 53 | 54 | with fsspec.open("git://file2", path=d, ref="master") 
as f: 54 | assert f.read() == b"data000" 55 | 56 | with fsspec.open("git://file2", path=d, ref=None) as f: 57 | assert f.read() == b"data000" 58 | 59 | with fsspec.open("git://inner/file1", path=d, ref="abranch") as f: 60 | assert f.read() == b"data3" 61 | 62 | 63 | def test_url(repo): 64 | d, sha = repo 65 | fs, _, paths = fsspec.core.get_fs_token_paths(f"git://file1::file://{d}") 66 | assert make_path_posix(d) in make_path_posix(fs.repo.path) 67 | assert paths == ["file1"] 68 | with fsspec.open(f"git://file1::file://{d}") as f: 69 | assert f.read() == b"data00" 70 | 71 | fs, _, paths = fsspec.core.get_fs_token_paths(f"git://{d}:master@file1") 72 | assert make_path_posix(d) in make_path_posix(fs.repo.path) 73 | assert paths == ["file1"] 74 | with fsspec.open(f"git://{d}:master@file1") as f: 75 | assert f.read() == b"data00" 76 | -------------------------------------------------------------------------------- /fsspec/config.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import json 3 | import os 4 | 5 | conf = {} 6 | default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec") 7 | conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir) 8 | 9 | 10 | def set_conf_env(conf_dict, envdict=os.environ): 11 | """Set config values from environment variables 12 | 13 | Looks for variables of the form ``FSSPEC_<protocol>_<kwarg>``. 14 | There is no attempt to convert strings, but the kwarg keys will 15 | be lower-cased. 16 | 17 | Parameters 18 | ---------- 19 | conf_dict : dict(str, dict) 20 | This dict will be mutated 21 | envdict : dict-like(str, str) 22 | Source for the values - usually the real environment 23 | """ 24 | for key in envdict: 25 | if key.startswith("FSSPEC"): 26 | if key.count("_") < 2: 27 | continue 28 | _, proto, kwarg = key.split("_", 2) 29 | conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key] 30 | 31 | 32 | def set_conf_files(cdir, conf_dict): 33 | """Set config values from files 34 | 35 | Scans for INI and JSON files in the given directory, and uses their 36 | contents to set the config. In case of repeated values, later values 37 | win. 38 | 39 | In the case of INI files, all values are strings, and these will not 40 | be converted.
41 | 42 | Parameters 43 | ---------- 44 | cdir : str 45 | Directory to search 46 | conf_dict : dict(str, dict) 47 | This dict will be mutated 48 | """ 49 | if not os.path.isdir(cdir): 50 | return 51 | allfiles = sorted(os.listdir(cdir)) 52 | for fn in allfiles: 53 | if fn.endswith(".ini"): 54 | ini = configparser.ConfigParser() 55 | ini.read(os.path.join(cdir, fn)) 56 | for key in ini: 57 | if key == "DEFAULT": 58 | continue 59 | conf_dict.setdefault(key, {}).update(dict(ini[key])) 60 | if fn.endswith(".json"): 61 | with open(os.path.join(cdir, fn)) as f: 62 | js = json.load(f) 63 | for key in js: 64 | conf_dict.setdefault(key, {}).update(dict(js[key])) 65 | 66 | 67 | def apply_config(cls, kwargs, conf_dict=None): 68 | """Supply default values for kwargs when instantiating class 69 | 70 | Augments the passed kwargs, by finding entries in the config dict 71 | which match the class's ``.protocol`` attribute (one or more str) 72 | 73 | Parameters 74 | ---------- 75 | cls : file system implementation 76 | kwargs : dict 77 | conf_dict : dict of dict 78 | Typically this is the global configuration 79 | 80 | Returns 81 | ------- 82 | dict : the modified set of kwargs 83 | """ 84 | if conf_dict is None: 85 | conf_dict = conf 86 | protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol] 87 | kw = {} 88 | for proto in protos: 89 | # default kwargs from the current state of the config 90 | if proto in conf_dict: 91 | kw.update(conf_dict[proto]) 92 | # explicit kwargs always win 93 | kw.update(**kwargs) 94 | kwargs = kw 95 | return kwargs 96 | 97 | 98 | set_conf_files(conf_dir, conf) 99 | set_conf_env(conf) 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # filesystem_spec 2 | 3 | [![PyPI version](https://badge.fury.io/py/fsspec.svg)](https://pypi.python.org/pypi/fsspec/) 4 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/fsspec/badges/version.svg)](https://anaconda.org/conda-forge/fsspec) 5 | ![Build](https://github.com/fsspec/filesystem_spec/workflows/CI/badge.svg) 6 | [![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest) 7 | 8 | A specification for pythonic filesystems. 9 | 10 | ## Install 11 | 12 | ```bash 13 | pip install fsspec 14 | ``` 15 | or 16 | ```bash 17 | conda install -c conda-forge fsspec 18 | ``` 19 | 20 | ## Purpose 21 | 22 | To produce a template or specification for a file-system interface that specific implementations should follow, 23 | so that applications making use of them can rely on a common behaviour and not have to worry about the specific 24 | internal implementation decisions of any given backend. Many such implementations are included in this package, 25 | or in sister projects such as `s3fs` and `gcsfs`. 26 | 27 | In addition, if this is well-designed, then additional functionality, such as a key-value store or FUSE 28 | mounting of the file-system implementation, may be available for all implementations "for free". 29 | 30 | ## Documentation 31 | 32 | Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest) 33 | 34 | ## Develop 35 | 36 | fsspec uses [tox](https://tox.readthedocs.io/en/latest/) and 37 | [tox-conda](https://github.com/tox-dev/tox-conda) to manage dev and test 38 | environments. First, install conda with tox and tox-conda in a base environment 39 | (e.g.
``conda install -c conda-forge tox tox-conda``). Calls to ``tox`` can then be 40 | used to configure a development environment and run tests. 41 | 42 | Next, set up a development conda environment via ``tox -e {env}`` where ``env`` is one of ``{py36,py37,py38,py39}``. 43 | This will install fsspec dependencies, test & dev tools, and install fsspec in develop 44 | mode. You may activate the dev environment under ``.tox/{env}`` via ``conda activate .tox/{env}``. 45 | 46 | ### Testing 47 | 48 | Tests can be run in the dev environment, if activated, via ``pytest fsspec``. 49 | 50 | Alternatively, the full fsspec test suite can be run via ``tox``, which will 51 | also build the appropriate environment (see above), with the environment specified 52 | by the TOXENV environment variable. 53 | 54 | The full fsspec suite requires a system-level docker, docker-compose, and fuse 55 | installation. 56 | 57 | ### Code Formatting 58 | 59 | fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure 60 | a consistent code format throughout the project. 61 | Run ``black fsspec`` from the root of the filesystem_spec repository to 62 | auto-format your code. Additionally, many editors have plugins that will apply 63 | ``black`` as you edit files. ``black`` is included in the ``tox`` environments. 64 | 65 | 66 | Optionally, you may wish to set up [pre-commit hooks](https://pre-commit.com) to 67 | automatically run ``black`` when you make a git commit. 68 | Run ``pre-commit install --install-hooks`` from the root of the 69 | filesystem_spec repository to set up pre-commit hooks. ``black`` will now be run 70 | before you commit, reformatting any changed files. You can format without 71 | committing via ``pre-commit run`` or skip these checks with ``git commit 72 | --no-verify``.
73 | -------------------------------------------------------------------------------- /fsspec/tests/test_registry.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from unittest.mock import create_autospec, patch 3 | 4 | import pytest 5 | 6 | from fsspec.registry import ( 7 | ReadOnlyError, 8 | _registry, 9 | get_filesystem_class, 10 | known_implementations, 11 | register_implementation, 12 | registry, 13 | ) 14 | from fsspec.spec import AbstractFileSystem 15 | 16 | try: 17 | from importlib.metadata import EntryPoint 18 | except ImportError: # python < 3.8 19 | from importlib_metadata import EntryPoint 20 | 21 | 22 | @pytest.fixture() 23 | def clear_registry(): 24 | try: 25 | yield 26 | finally: 27 | _registry.clear() 28 | known_implementations.pop("test", None) 29 | 30 | 31 | @pytest.fixture() 32 | def clean_imports(): 33 | try: 34 | real_module = sys.modules["fsspec"] 35 | del sys.modules["fsspec"] 36 | yield 37 | finally: 38 | sys.modules["fsspec"] = real_module 39 | 40 | 41 | def test_registry_readonly(): 42 | get_filesystem_class("file") 43 | assert "file" in registry 44 | assert "file" in list(registry) 45 | with pytest.raises(ReadOnlyError): 46 | del registry["file"] 47 | with pytest.raises(ReadOnlyError): 48 | registry["file"] = None 49 | with pytest.raises(ReadOnlyError): 50 | registry.clear() 51 | 52 | 53 | def test_register_cls(clear_registry): 54 | with pytest.raises(ValueError): 55 | get_filesystem_class("test") 56 | register_implementation("test", AbstractFileSystem) 57 | cls = get_filesystem_class("test") 58 | assert cls is AbstractFileSystem 59 | 60 | 61 | def test_register_str(clear_registry): 62 | with pytest.raises(ValueError): 63 | get_filesystem_class("test") 64 | register_implementation("test", "fsspec.AbstractFileSystem") 65 | assert "test" not in registry 66 | cls = get_filesystem_class("test") 67 | assert cls is AbstractFileSystem 68 | assert "test" in registry 69 | 70 | 71 | def test_register_fail(clear_registry): 72 | register_implementation("test", "doesntexist.AbstractFileSystem") 73 | with pytest.raises(ImportError): 74 | get_filesystem_class("test") 75 | 76 | register_implementation("test", "doesntexist.AbstractFileSystem") 77 | with pytest.raises(ValueError): 78 | register_implementation("test", "doesntexist.AbstractFileSystem", clobber=False) 79 | 80 | register_implementation( 81 | "test", "doesntexist.AbstractFileSystem", errtxt="hiho", clobber=True 82 | ) 83 | with pytest.raises(ImportError) as e: 84 | get_filesystem_class("test") 85 | assert "hiho" in str(e.value) 86 | register_implementation("test", AbstractFileSystem) 87 | 88 | with pytest.raises(ValueError): 89 | register_implementation("test", AbstractFileSystem, clobber=False) 90 | register_implementation("test", AbstractFileSystem, clobber=True) 91 | 92 | 93 | def test_entry_points_registered_on_import(clear_registry, clean_imports): 94 | mock_ep = create_autospec(EntryPoint, module="fsspec.spec.AbstractFileSystem") 95 | mock_ep.name = "test" # this can't be set in the constructor... 
96 | mock_ep.value = "fsspec.spec.AbstractFileSystem" 97 | if sys.version_info < (3, 8): 98 | import_location = "importlib_metadata.entry_points" 99 | else: 100 | import_location = "importlib.metadata.entry_points" 101 | with patch(import_location, return_value={"fsspec.specs": [mock_ep]}): 102 | assert "test" not in registry 103 | import fsspec # noqa 104 | 105 | get_filesystem_class("test") 106 | assert "test" in registry 107 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_smb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Test SMBFileSystem class using a docker container 4 | """ 5 | 6 | import logging 7 | import shlex 8 | import subprocess 9 | import time 10 | 11 | import pytest 12 | 13 | import fsspec 14 | 15 | pytest.importorskip("smbprotocol") 16 | 17 | # ! pylint: disable=redefined-outer-name,missing-function-docstring 18 | 19 | 20 | def stop_docker(container): 21 | cmd = shlex.split('docker ps -a -q --filter "name=%s"' % container) 22 | cid = subprocess.check_output(cmd).strip().decode() 23 | if cid: 24 | subprocess.call(["docker", "rm", "-f", "-v", cid]) 25 | 26 | 27 | @pytest.fixture(scope="module") 28 | def smb_params(): 29 | try: 30 | pchk = ["docker", "run", "--name", "fsspec_test_smb", "hello-world"] 31 | subprocess.check_call(pchk) 32 | except (subprocess.CalledProcessError, FileNotFoundError): 33 | pytest.skip("docker run not available") 34 | return 35 | stop_docker("fsspec_test_smb") 36 | 37 | # requires docker 38 | container = "fsspec_smb" 39 | stop_docker(container) 40 | img = "docker run --name {} --detach -p 139:139 -p 445:445 dperson/samba" 41 | cfg = " -p -u 'testuser;testpass' -s 'home;/share;no;no;no;testuser'" 42 | cmd = img.format(container) + cfg 43 | cid = subprocess.check_output(shlex.split(cmd)).strip().decode() 44 | logger = logging.getLogger("fsspec") 45 | logger.debug("Container: %s", cid) 46 | try: 47 | time.sleep(1) 48 | yield dict(host="localhost", port=445, username="testuser", password="testpass") 49 | finally: 50 | import smbclient # pylint: disable=import-outside-toplevel 51 | 52 | smbclient.reset_connection_cache() 53 | stop_docker(container) 54 | 55 | 56 | def test_simple(smb_params): 57 | adir = "/home/adir" 58 | adir2 = "/home/adir/otherdir/" 59 | afile = "/home/adir/otherdir/afile" 60 | fsmb = fsspec.get_filesystem_class("smb")(**smb_params) 61 | fsmb.mkdirs(adir2) 62 | fsmb.touch(afile) 63 | assert fsmb.find(adir) == [afile] 64 | assert fsmb.ls(adir2, detail=False) == [afile] 65 | assert fsmb.info(afile)["type"] == "file" 66 | assert fsmb.info(afile)["size"] == 0 67 | assert fsmb.exists(adir) 68 | fsmb.rm(adir, recursive=True) 69 | assert not fsmb.exists(adir) 70 | 71 | 72 | def test_with_url(smb_params): 73 | smb_url = "smb://{username}:{password}@{host}:{port}/home/someuser.txt" 74 | fwo = fsspec.open(smb_url.format(**smb_params), "wb") 75 | with fwo as fwr: 76 | fwr.write(b"hello") 77 | fro = fsspec.open(smb_url.format(**smb_params), "rb") 78 | with fro as frd: 79 | read_result = frd.read() 80 | assert read_result == b"hello" 81 | 82 | 83 | def test_transaction(smb_params): 84 | afile = "/home/afolder/otherdir/afile" 85 | afile2 = "/home/afolder/otherdir/afile2" 86 | adir = "/home/afolder" 87 | adir2 = "/home/afolder/otherdir" 88 | fsmb = fsspec.get_filesystem_class("smb")(**smb_params) 89 | fsmb.mkdirs(adir2) 90 | fsmb.start_transaction() 91 | fsmb.touch(afile) 92 | assert fsmb.find(adir) == [] 
93 | fsmb.end_transaction() 94 | assert fsmb.find(adir) == [afile] 95 | 96 | with fsmb.transaction: 97 | assert fsmb._intrans 98 | fsmb.touch(afile2) 99 | assert fsmb.find(adir) == [afile] 100 | assert fsmb.find(adir) == [afile, afile2] 101 | 102 | 103 | def test_makedirs_exist_ok(smb_params): 104 | fsmb = fsspec.get_filesystem_class("smb")(**smb_params) 105 | fsmb.makedirs("/home/a/b/c") 106 | fsmb.makedirs("/home/a/b/c", exist_ok=True) 107 | -------------------------------------------------------------------------------- /fsspec/implementations/zip.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import zipfile 4 | 5 | from fsspec import open_files 6 | from fsspec.archive import AbstractArchiveFileSystem 7 | from fsspec.utils import DEFAULT_BLOCK_SIZE 8 | 9 | 10 | class ZipFileSystem(AbstractArchiveFileSystem): 11 | """Read contents of ZIP archive as a file-system 12 | 13 | Keeps file object open while instance lives. 14 | 15 | This class is pickleable, but not necessarily thread-safe 16 | """ 17 | 18 | root_marker = "" 19 | protocol = "zip" 20 | cachable = False 21 | 22 | def __init__( 23 | self, 24 | fo="", 25 | mode="r", 26 | target_protocol=None, 27 | target_options=None, 28 | block_size=DEFAULT_BLOCK_SIZE, 29 | **kwargs, 30 | ): 31 | """ 32 | Parameters 33 | ---------- 34 | fo: str or file-like 35 | Contains ZIP, and must exist. If a str, will fetch file using 36 | `open_files()`, which must return one file exactly. 37 | mode: str 38 | Currently, only 'r' accepted 39 | target_protocol: str (optional) 40 | If ``fo`` is a string, this value can be used to override the 41 | FS protocol inferred from a URL 42 | target_options: dict (optional) 43 | Kwargs passed when instantiating the target FS, if ``fo`` is 44 | a string. 
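        Examples
        --------
        A minimal sketch; ``archive.zip`` here is a hypothetical local file
        containing the archive to read:

        >>> fs = ZipFileSystem("archive.zip")  # doctest: +SKIP
        >>> fs.ls("")  # doctest: +SKIP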
45 | """ 46 | super().__init__(self, **kwargs) 47 | if mode != "r": 48 | raise ValueError("Only read from zip files accepted") 49 | if isinstance(fo, str): 50 | files = open_files(fo, protocol=target_protocol, **(target_options or {})) 51 | if len(files) != 1: 52 | raise ValueError( 53 | 'Path "{}" did not resolve to exactly' 54 | 'one file: "{}"'.format(fo, files) 55 | ) 56 | fo = files[0] 57 | self.fo = fo.__enter__() # the whole instance is a context 58 | self.zip = zipfile.ZipFile(self.fo) 59 | self.block_size = block_size 60 | self.dir_cache = None 61 | 62 | @classmethod 63 | def _strip_protocol(cls, path): 64 | # zip file paths are always relative to the archive root 65 | return super()._strip_protocol(path).lstrip("/") 66 | 67 | def _get_dirs(self): 68 | if self.dir_cache is None: 69 | files = self.zip.infolist() 70 | self.dir_cache = { 71 | dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"} 72 | for dirname in self._all_dirnames(self.zip.namelist()) 73 | } 74 | for z in files: 75 | f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__} 76 | f.update( 77 | { 78 | "name": z.filename, 79 | "size": z.file_size, 80 | "type": ("directory" if z.is_dir() else "file"), 81 | } 82 | ) 83 | self.dir_cache[f["name"]] = f 84 | 85 | def _open( 86 | self, 87 | path, 88 | mode="rb", 89 | block_size=None, 90 | autocommit=True, 91 | cache_options=None, 92 | **kwargs, 93 | ): 94 | path = self._strip_protocol(path) 95 | if mode != "rb": 96 | raise NotImplementedError 97 | info = self.info(path) 98 | out = self.zip.open(path, "r") 99 | out.size = info["size"] 100 | out.name = info["name"] 101 | return out 102 | -------------------------------------------------------------------------------- /fsspec/tests/test_caches.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import string 3 | 4 | import pytest 5 | 6 | from fsspec.caching import BlockCache, FirstChunkCache, caches 7 | 8 | 9 | def test_cache_getitem(Cache_imp): 10 | cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters)) 11 | assert cacher._fetch(0, 4) == b"abcd" 12 | assert cacher._fetch(None, 4) == b"abcd" 13 | assert cacher._fetch(2, 4) == b"cd" 14 | 15 | 16 | def test_block_cache_lru(): 17 | cache = BlockCache(4, letters_fetcher, len(string.ascii_letters), maxblocks=2) 18 | # miss 19 | cache._fetch(0, 2) 20 | assert cache.cache_info().misses == 1 21 | assert cache.cache_info().currsize == 1 22 | 23 | # hit 24 | cache._fetch(0, 2) 25 | assert cache.cache_info().misses == 1 26 | assert cache.cache_info().currsize == 1 27 | 28 | # miss 29 | cache._fetch(4, 6) 30 | assert cache.cache_info().misses == 2 31 | assert cache.cache_info().currsize == 2 32 | 33 | # miss & evict 34 | cache._fetch(12, 13) 35 | assert cache.cache_info().misses == 3 36 | assert cache.cache_info().currsize == 2 37 | 38 | 39 | def _fetcher(start, end): 40 | return b"0" * (end - start) 41 | 42 | 43 | def letters_fetcher(start, end): 44 | return string.ascii_letters[start:end].encode() 45 | 46 | 47 | not_parts_caches = {k: v for k, v in caches.items() if k != "parts"} 48 | 49 | 50 | @pytest.fixture(params=not_parts_caches.values(), ids=list(not_parts_caches)) 51 | def Cache_imp(request): 52 | return request.param 53 | 54 | 55 | def test_cache_empty_file(Cache_imp): 56 | blocksize = 5 57 | size = 0 58 | cache = Cache_imp(blocksize, _fetcher, size) 59 | assert cache._fetch(0, 0) == b"" 60 | 61 | 62 | def test_cache_pickleable(Cache_imp): 63 | blocksize = 5 64 | size = 100 65 | cache = 
Cache_imp(blocksize, _fetcher, size) 66 | cache._fetch(0, 5) # fill in cache 67 | unpickled = pickle.loads(pickle.dumps(cache)) 68 | assert isinstance(unpickled, Cache_imp) 69 | assert unpickled.blocksize == blocksize 70 | assert unpickled.size == size 71 | assert unpickled._fetch(0, 10) == b"0" * 10 72 | 73 | 74 | def test_first_cache(): 75 | c = FirstChunkCache(5, letters_fetcher, 52) 76 | assert c.cache is None 77 | assert c._fetch(12, 15) == letters_fetcher(12, 15) 78 | assert c.cache is None 79 | assert c._fetch(3, 10) == letters_fetcher(3, 10) 80 | assert c.cache == letters_fetcher(0, 5) 81 | c.fetcher = None 82 | assert c._fetch(1, 4) == letters_fetcher(1, 4) 83 | 84 | 85 | @pytest.mark.parametrize( 86 | "size_requests", 87 | [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]], 88 | ) 89 | @pytest.mark.parametrize("blocksize", [1, 10, 52, 100]) 90 | def test_cache_basic(Cache_imp, blocksize, size_requests): 91 | cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters)) 92 | 93 | for start, end in size_requests: 94 | result = cache._fetch(start, end) 95 | expected = string.ascii_letters[start:end].encode() 96 | assert result == expected 97 | 98 | 99 | @pytest.mark.parametrize("strict", [True, False]) 100 | @pytest.mark.parametrize("sort", [True, False]) 101 | def test_known(sort, strict): 102 | parts = {(10, 20): b"1" * 10, (20, 30): b"2" * 10, (0, 10): b"0" * 10} 103 | if sort: 104 | parts = {k: v for k, v in sorted(parts.items())} 105 | c = caches["parts"](None, None, 100, parts, strict=strict) 106 | assert (0, 30) in c.data # got consolidated 107 | assert c._fetch(5, 15) == b"0" * 5 + b"1" * 5 108 | assert c._fetch(15, 25) == b"1" * 5 + b"2" * 5 109 | if strict: 110 | # Over-read will raise error 111 | with pytest.raises(ValueError): 112 | # tries to call None fetcher 113 | c._fetch(25, 35) 114 | else: 115 | # Over-read will be zero-padded 116 | assert c._fetch(25, 35) == b"2" * 5 + b"\x00" * 5 117 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_hdfs.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | import fsspec 6 | 7 | pyarrow = pytest.importorskip("pyarrow") 8 | 9 | basedir = "/tmp/test-fsspec" 10 | data = b"\n".join([b"some test data"] * 1000) 11 | 12 | 13 | @pytest.fixture 14 | def hdfs(request): 15 | try: 16 | hdfs = pyarrow.hdfs.HadoopFileSystem() 17 | except IOError: 18 | pytest.skip("No HDFS configured") 19 | 20 | if hdfs.exists(basedir): 21 | hdfs.rm(basedir, recursive=True) 22 | 23 | hdfs.mkdir(basedir) 24 | 25 | with hdfs.open(basedir + "/file", "wb") as f: 26 | f.write(data) 27 | 28 | yield hdfs 29 | 30 | if hdfs.exists(basedir): 31 | hdfs.rm(basedir, recursive=True) 32 | 33 | 34 | def test_ls(hdfs): 35 | fs = fsspec.filesystem("hdfs") 36 | fs.touch(basedir + "/file_2") 37 | fs.mkdir(basedir + "/dir_1") 38 | fs.touch(basedir + "/dir_1/file_1") 39 | fs.mkdir(basedir + "/dir_2") 40 | 41 | out = {(f["name"], f["kind"]) for f in fs.ls(basedir)} 42 | assert out == { 43 | (basedir + "/file", "file"), 44 | (basedir + "/file_2", "file"), 45 | (basedir + "/dir_1", "directory"), 46 | (basedir + "/dir_2", "directory"), 47 | } 48 | 49 | 50 | def test_walk(hdfs): 51 | h = fsspec.filesystem("hdfs") 52 | out = h.walk(basedir) 53 | assert list(out) == list(hdfs.walk(basedir)) 54 | 55 | 56 | def test_isdir(hdfs): 57 | h = fsspec.filesystem("hdfs") 58 | assert h.isdir(basedir) 59 | 
assert not h.isdir(basedir + "/file") 60 | 61 | 62 | def test_exists(hdfs): 63 | h = fsspec.filesystem("hdfs") 64 | assert not h.exists(basedir + "/notafile") 65 | 66 | 67 | def test_read(hdfs): 68 | h = fsspec.filesystem("hdfs") 69 | out = basedir + "/file" 70 | with h.open(out, "rb") as f: 71 | assert f.read() == data 72 | with h.open(out, "rb", block_size=0) as f: 73 | assert f.read() == data 74 | with h.open(out, "rb") as f: 75 | assert f.read(100) + f.read() == data 76 | 77 | 78 | def test_copy(hdfs): 79 | fs = fsspec.filesystem("hdfs") 80 | 81 | fs.mkdir(basedir + "/test_dir") 82 | fs.touch(basedir + "/test_dir/a") 83 | fs.touch(basedir + "/test_dir/b") 84 | fs.mkdir(basedir + "/test_dir/c") 85 | fs.touch(basedir + "/test_dir/c/d") 86 | 87 | fs.copy(basedir + "/test_dir", basedir + "/copy_dir", recursive=True) 88 | assert fs.find(basedir + "/copy_dir", detail=False) == [ 89 | basedir + "/copy_dir" + "/a", 90 | basedir + "/copy_dir" + "/b", 91 | basedir + "/copy_dir" + "/c/d", 92 | ] 93 | 94 | 95 | def test_put_get(hdfs, tmpdir): 96 | fs = fsspec.filesystem("hdfs") 97 | 98 | src_dir = Path(tmpdir / "source") 99 | dst_dir = Path(tmpdir / "destination") 100 | 101 | src_dir.mkdir() 102 | (src_dir / "file_1.txt").write_text("file_1") 103 | (src_dir / "file_2.txt").write_text("file_2") 104 | (src_dir / "dir_1").mkdir() 105 | (src_dir / "dir_1" / "file_3.txt").write_text("file_3") 106 | (src_dir / "dir_1" / "file_4.txt").write_text("file_4") 107 | (src_dir / "dir_2").mkdir() 108 | 109 | fs.put(str(src_dir), basedir + "/src", recursive=True) 110 | fs.get(basedir + "/src", str(dst_dir), recursive=True) 111 | 112 | files = [file.relative_to(dst_dir) for file in dst_dir.glob("**/*")] 113 | 114 | assert set(map(str, files)) == { 115 | "file_1.txt", 116 | "file_2.txt", 117 | "dir_1/file_3.txt", 118 | "dir_1/file_4.txt", 119 | "dir_1", 120 | "dir_2", 121 | } 122 | 123 | assert { 124 | (dst_dir / file).read_text() for file in files if (dst_dir / file).is_file() 125 | } == {"file_1", "file_2", "file_3", "file_4"} 126 | 127 | 128 | def test_put_file_get_file(hdfs, tmpdir): 129 | fs = fsspec.filesystem("hdfs") 130 | 131 | src_file = Path(tmpdir / "src_file") 132 | dst_file = Path(tmpdir / "dst_file") 133 | 134 | src_file.write_bytes(b"heyhey") 135 | 136 | fs.put_file(src_file, basedir + "/src_file") 137 | fs.get_file(basedir + "/src_file", dst_file) 138 | 139 | assert dst_file.read_bytes() == b"heyhey" 140 | -------------------------------------------------------------------------------- /fsspec/implementations/jupyter.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import re 4 | 5 | import requests 6 | 7 | import fsspec 8 | 9 | 10 | class JupyterFileSystem(fsspec.AbstractFileSystem): 11 | """View of the files as seen by a Jupyter server (notebook or lab)""" 12 | 13 | protocol = ("jupyter", "jlab") 14 | 15 | def __init__(self, url, tok=None, **kwargs): 16 | """ 17 | 18 | Parameters 19 | ---------- 20 | url : str 21 | Base URL of the server, like "http://127.0.0.1:8888". May include 22 | token in the string, which is given by the process when starting up 23 | tok : str 24 | If the token is obtained separately, can be given here 25 | kwargs 26 | """ 27 | if "?" 
in url: 28 | if tok is None: 29 | try: 30 | tok = re.findall("token=([a-z0-9]+)", url)[0] 31 | except IndexError as e: 32 | raise ValueError("Could not determine token") from e 33 | url = url.split("?", 1)[0] 34 | self.url = url.rstrip("/") + "/api/contents" 35 | self.session = requests.Session() 36 | if tok: 37 | self.session.headers["Authorization"] = f"token {tok}" 38 | 39 | super().__init__(**kwargs) 40 | 41 | def ls(self, path, detail=True, **kwargs): 42 | path = self._strip_protocol(path) 43 | r = self.session.get(self.url + "/" + path) 44 | if r.status_code == 404: 45 | raise FileNotFoundError(path) 46 | r.raise_for_status() 47 | out = r.json() 48 | 49 | if out["type"] == "directory": 50 | out = out["content"] 51 | else: 52 | out = [out] 53 | for o in out: 54 | o["name"] = o.pop("path") 55 | o.pop("content") 56 | if o["type"] == "notebook": 57 | o["type"] = "file" 58 | if detail: 59 | return out 60 | return [o["name"] for o in out] 61 | 62 | def cat_file(self, path, start=None, end=None, **kwargs): 63 | path = self._strip_protocol(path) 64 | r = self.session.get(self.url + "/" + path) 65 | if r.status_code == 404: 66 | raise FileNotFoundError(path) 67 | r.raise_for_status() 68 | out = r.json() 69 | if out["format"] == "text": 70 | # data should be binary 71 | b = out["content"].encode() 72 | else: 73 | b = base64.b64decode(out["content"]) 74 | return b[start:end] 75 | 76 | def pipe_file(self, path, value, **_): 77 | path = self._strip_protocol(path) 78 | json = { 79 | "name": path.rsplit("/", 1)[-1], 80 | "path": path, 81 | "size": len(value), 82 | "content": base64.b64encode(value).decode(), 83 | "format": "base64", 84 | "type": "file", 85 | } 86 | self.session.put(self.url + "/" + path, json=json) 87 | 88 | def mkdir(self, path, create_parents=True, **kwargs): 89 | path = self._strip_protocol(path) 90 | if create_parents and "/" in path: 91 | self.mkdir(path.rsplit("/", 1)[0], True) 92 | json = { 93 | "name": path.rsplit("/", 1)[-1], 94 | "path": path, 95 | "size": None, 96 | "content": None, 97 | "type": "directory", 98 | } 99 | self.session.put(self.url + "/" + path, json=json) 100 | 101 | def _rm(self, path): 102 | path = self._strip_protocol(path) 103 | self.session.delete(self.url + "/" + path) 104 | 105 | def _open(self, path, mode="rb", **kwargs): 106 | path = self._strip_protocol(path) 107 | if mode == "rb": 108 | data = self.cat_file(path) 109 | return io.BytesIO(data) 110 | else: 111 | return SimpleFileWriter(self, path, mode="wb") 112 | 113 | 114 | class SimpleFileWriter(fsspec.spec.AbstractBufferedFile): 115 | def _upload_chunk(self, final=False): 116 | """Never uploads a chunk until file is done 117 | 118 | Not suitable for large files 119 | """ 120 | if final is False: 121 | return False 122 | self.buffer.seek(0) 123 | data = self.buffer.read() 124 | self.fs.pipe_file(self.path, data) 125 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ``fsspec``: Filesystem interfaces for Python 2 | ============================================ 3 | 4 | Filesystem Spec (``fsspec``) is a project to provide a unified pythonic interface to 5 | local, remote and embedded file systems and bytes storage. 6 | 7 | Brief Overview 8 | -------------- 9 | 10 | There are many places to store bytes, from in memory, to the local disk, cluster 11 | distributed storage, to the cloud.
Many files also contain internal mappings of names to bytes, 12 | maybe in a hierarchical directory-oriented tree. Working with all these different 13 | storage media, and their associated libraries, is a pain. ``fsspec`` exists to 14 | provide a familiar API that will work the same whatever the storage backend. 15 | As much as possible, we iron out the quirks specific to each implementation, 16 | so you need do no more than provide credentials for each service you access 17 | (if needed) and thereafter not have to worry about the implementation again. 18 | 19 | Why 20 | --- 21 | 22 | ``fsspec`` provides two main concepts: a set of filesystem classes with uniform APIs 23 | (i.e., functions such as ``cp``, ``rm``, ``cat``, ``mkdir``, ...) supplying operations on a range of 24 | storage systems; and top-level convenience functions like :func:`fsspec.open`, to allow 25 | you to quickly get from a URL to a file-like object that you can use with a third-party 26 | library or your own code. 27 | 28 | The section :doc:`intro` gives motivation and history of this project, but 29 | most users will want to skip straight to :doc:`usage` to find out how to use 30 | the package and :doc:`features` to see the long list of added functionality 31 | included along with the basic file-system interface. 32 | 33 | 34 | Who uses ``fsspec``? 35 | -------------------- 36 | 37 | You can use ``fsspec``'s file objects with any python function that accepts 38 | file objects, because of *duck typing*. 39 | 40 | You may well be using ``fsspec`` already without knowing it. 41 | The following libraries use ``fsspec`` internally for path and file handling: 42 | 43 | #. `Dask`_, the parallel, out-of-core and distributed 44 | programming platform 45 | #. `Intake`_, the data source cataloguing and loading 46 | library and its plugins 47 | #. `pandas`_, the tabular data analysis package 48 | #. `xarray`_ and `zarr`_, multidimensional array 49 | storage and labelled operations 50 | #. `DVC`_, version control system 51 | for machine learning projects 52 | #. `Kedro`_, a Python framework for reproducible, 53 | maintainable and modular data science code 54 | 55 | ``fsspec`` filesystems are also supported by: 56 | 57 | #. `pyarrow`_, the in-memory data layout engine 58 | #. `petl`_, a general purpose package for extracting, transforming and loading tables of data. 59 | 60 | ... plus many more that we don't know about. 61 | 62 | .. _Dask: https://dask.org/ 63 | .. _Intake: https://intake.readthedocs.io/ 64 | .. _pandas: https://pandas.pydata.org/ 65 | .. _xarray: http://xarray.pydata.org/ 66 | .. _zarr: https://zarr.readthedocs.io/ 67 | .. _DVC: https://dvc.org/ 68 | .. _kedro: https://kedro.readthedocs.io/en/stable/01_introduction/01_introduction.html 69 | .. _pyarrow: https://arrow.apache.org/docs/python/ 70 | .. _petl: https://petl.readthedocs.io/en/stable/io.html#petl.io.remotes.RemoteSource 71 | 72 | Installation 73 | ------------ 74 | 75 | `fsspec` can be installed from PyPI or conda and has no dependencies of its own 76 | 77 | .. code-block:: sh 78 | 79 | pip install fsspec 80 | conda install -c conda-forge fsspec 81 | 82 | Not all filesystem implementations are available without installing extra 83 | dependencies. For example, to be able to access data in Google Cloud Storage (GCS), you can use the optional 84 | pip install syntax below, or install the specific package required 85 | 86 | .. 
code-block:: sh 87 | 88 | pip install fsspec[gcs] 89 | conda install -c conda-forge gcsfs 90 | 91 | `fsspec` attempts to provide the right message when you attempt to use a filesystem 92 | for which you need additional dependencies. 93 | The current list of known implementations can be found as follows 94 | 95 | .. code-block:: python 96 | 97 | from fsspec.registry import known_implementations 98 | 99 | known_implementations 100 | 101 | 102 | 103 | .. toctree:: 104 | :maxdepth: 1 105 | :caption: Contents: 106 | 107 | intro.rst 108 | usage.rst 109 | features.rst 110 | developer.rst 111 | async.rst 112 | api.rst 113 | changelog.rst 114 | -------------------------------------------------------------------------------- /fsspec/tests/test_fuse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | from multiprocessing import Process 5 | 6 | import pytest 7 | 8 | try: 9 | pytest.importorskip("fuse") # noqa: E402 10 | except OSError: 11 | # can succeed in importing fuse, but fail to load the .so 12 | pytest.importorskip("nonexistent") # noqa: E402 13 | 14 | from fsspec.fuse import main, run 15 | from fsspec.implementations.memory import MemoryFileSystem 16 | 17 | 18 | def host_fuse(mountdir): 19 | fs = MemoryFileSystem() 20 | fs.touch("/mounted/testfile") 21 | run(fs, "/mounted/", mountdir) 22 | 23 | 24 | def test_basic(tmpdir, capfd): 25 | mountdir = str(tmpdir.mkdir("mount")) 26 | 27 | fuse_process = Process(target=host_fuse, args=(str(mountdir),)) 28 | fuse_process.start() 29 | 30 | try: 31 | timeout = 10 32 | while True: 33 | try: 34 | # can fail with device not ready while waiting for fuse 35 | if "testfile" in os.listdir(mountdir): 36 | break 37 | except Exception: 38 | pass 39 | timeout -= 1 40 | time.sleep(1) 41 | if not timeout > 0: 45 | pytest.skip(msg="fuse didn't come live") 46 | 47 | fn = os.path.join(mountdir, "test") 48 | with open(fn, "wb") as f: 49 | f.write(b"data") 50 | 51 | with open(fn) as f: 52 | assert f.read() == "data" 53 | 54 | os.remove(fn) 55 | 56 | os.mkdir(fn) 57 | assert os.listdir(fn) == [] 58 | 59 | os.mkdir(fn + "/inner") 60 | 61 | with pytest.raises(OSError): 62 | os.rmdir(fn) 63 | 64 | captured = capfd.readouterr() 65 | assert "Traceback" not in captured.out 66 | assert "Traceback" not in captured.err 67 | 68 | os.rmdir(fn + "/inner") 69 | os.rmdir(fn) 70 | finally: 71 | fuse_process.terminate() 72 | fuse_process.join(timeout=10) 73 | if fuse_process.is_alive(): 74 | fuse_process.kill() 75 | fuse_process.join() 76 | 77 | 78 | def host_mount_local(source_dir, mount_dir, debug_log): 79 | main(["local", source_dir, mount_dir, "-l", debug_log, "--ready-file"]) 80 | 81 | 82 | @pytest.fixture() 83 | def mount_local(tmpdir): 84 | source_dir = tmpdir.mkdir("source") 85 | mount_dir = tmpdir.mkdir("local") 86 | debug_log = tmpdir / "debug.log" 87 | fuse_process = Process( 88 | target=host_mount_local, args=(str(source_dir), str(mount_dir), str(debug_log)) 89 | ) 90 | fuse_process.start() 91 | ready_file = mount_dir / ".fuse_ready" 92 | for _ in range(20): 93 | if ready_file.exists() and open(ready_file).read() == "ready": 94 | break 95 | time.sleep(0.1) 96 | try: 97 | yield (source_dir, mount_dir) 98 | finally: 99 | fuse_process.terminate() 100 | fuse_process.join(timeout=10) 101 | if fuse_process.is_alive(): 102 | fuse_process.kill() 103 | fuse_process.join() 104 | 105 | 106 | def test_mount(mount_local): 107 | source_dir, mount_dir = mount_local
108 | assert os.listdir(mount_dir) == [] 109 | assert os.listdir(source_dir) == [] 110 | 111 | mount_dir.mkdir("a") 112 | 113 | assert os.listdir(mount_dir) == ["a"] 114 | assert os.listdir(source_dir) == ["a"] 115 | 116 | 117 | def test_chmod(mount_local): 118 | source_dir, mount_dir = mount_local 119 | open(mount_dir / "text", "w").write("test") 120 | assert os.listdir(source_dir) == ["text"] 121 | 122 | cp = subprocess.run( 123 | ["cp", str(mount_dir / "text"), str(mount_dir / "new")], 124 | stdout=subprocess.PIPE, 125 | stderr=subprocess.PIPE, 126 | ) 127 | 128 | assert cp.stderr == b"" 129 | assert cp.stdout == b"" 130 | assert set(os.listdir(source_dir)) == set(["text", "new"]) 131 | assert open(mount_dir / "new").read() == "test" 132 | 133 | 134 | def test_seek_rw(mount_local): 135 | source_dir, mount_dir = mount_local 136 | fh = open(mount_dir / "text", "w") 137 | fh.write("teST") 138 | fh.seek(2) 139 | fh.write("st") 140 | fh.close() 141 | 142 | fh = open(mount_dir / "text", "r") 143 | assert fh.read() == "test" 144 | fh.seek(2) 145 | assert fh.read() == "st" 146 | fh.close() 147 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_webhdfs.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import shlex 3 | import subprocess 4 | import time 5 | 6 | import pytest 7 | 8 | import fsspec 9 | 10 | requests = pytest.importorskip("requests") 11 | 12 | from fsspec.implementations.webhdfs import WebHDFS # noqa: E402 13 | 14 | 15 | @pytest.fixture(scope="module") 16 | def hdfs_cluster(): 17 | cmd0 = shlex.split("htcluster shutdown") 18 | try: 19 | subprocess.check_output(cmd0, stderr=subprocess.STDOUT) 20 | except FileNotFoundError: 21 | pytest.skip("htcluster not found") 22 | except subprocess.CalledProcessError as ex: 23 | pytest.skip("htcluster failed: " + ex.output.decode()) 24 | cmd1 = shlex.split("htcluster startup --image base") 25 | subprocess.check_output(cmd1) 26 | try: 27 | t = 90 28 | while True: 29 | try: 30 | requests.get("http://localhost:50070/webhdfs/v1/?op=LISTSTATUS") 31 | except: # noqa: E722 32 | t -= 1 33 | assert t > 0, "Timeout waiting for HDFS" 34 | time.sleep(1) 35 | continue 36 | break 37 | time.sleep(7) 38 | yield "localhost" 39 | finally: 40 | subprocess.check_output(cmd0) 41 | 42 | 43 | def test_pickle(hdfs_cluster): 44 | w = WebHDFS(hdfs_cluster, user="testuser") 45 | w2 = pickle.loads(pickle.dumps(w)) 46 | assert w == w2 47 | 48 | 49 | def test_simple(hdfs_cluster): 50 | w = WebHDFS(hdfs_cluster, user="testuser") 51 | home = w.home_directory() 52 | assert home == "/user/testuser" 53 | with pytest.raises(PermissionError): 54 | w.mkdir("/root") 55 | 56 | 57 | def test_url(hdfs_cluster): 58 | url = "webhdfs://testuser@localhost:50070/user/testuser/myfile" 59 | fo = fsspec.open(url, "wb", data_proxy={"worker.example.com": "localhost"}) 60 | with fo as f: 61 | f.write(b"hello") 62 | fo = fsspec.open(url, "rb", data_proxy={"worker.example.com": "localhost"}) 63 | with fo as f: 64 | assert f.read() == b"hello" 65 | 66 | 67 | def test_workflow(hdfs_cluster): 68 | w = WebHDFS( 69 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 70 | ) 71 | fn = "/user/testuser/testrun/afile" 72 | w.mkdir("/user/testuser/testrun") 73 | with w.open(fn, "wb") as f: 74 | f.write(b"hello") 75 | assert w.exists(fn) 76 | info = w.info(fn) 77 | assert info["size"] == 5 78 | assert w.isfile(fn) 79 | assert w.cat(fn) == b"hello" 80 | 
w.rm("/user/testuser/testrun", recursive=True) 81 | assert not w.exists(fn) 82 | 83 | 84 | def test_with_gzip(hdfs_cluster): 85 | from gzip import GzipFile 86 | 87 | w = WebHDFS( 88 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 89 | ) 90 | fn = "/user/testuser/gzfile" 91 | with w.open(fn, "wb") as f: 92 | gf = GzipFile(fileobj=f, mode="w") 93 | gf.write(b"hello") 94 | gf.close() 95 | with w.open(fn, "rb") as f: 96 | gf = GzipFile(fileobj=f, mode="r") 97 | assert gf.read() == b"hello" 98 | 99 | 100 | def test_workflow_transaction(hdfs_cluster): 101 | w = WebHDFS( 102 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 103 | ) 104 | fn = "/user/testuser/testrun/afile" 105 | w.mkdirs("/user/testuser/testrun") 106 | with w.transaction: 107 | with w.open(fn, "wb") as f: 108 | f.write(b"hello") 109 | assert not w.exists(fn) 110 | assert w.exists(fn) 111 | assert w.ukey(fn) 112 | files = w.ls("/user/testuser/testrun", True) 113 | summ = w.content_summary("/user/testuser/testrun") 114 | assert summ["length"] == files[0]["size"] 115 | assert summ["fileCount"] == 1 116 | 117 | w.rm("/user/testuser/testrun", recursive=True) 118 | assert not w.exists(fn) 119 | 120 | 121 | def test_webhdfs_cp_file(hdfs_cluster): 122 | fs = WebHDFS( 123 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} 124 | ) 125 | 126 | src, dst = "/user/testuser/testrun/f1", "/user/testuser/testrun/f2" 127 | 128 | fs.mkdir("/user/testuser/testrun") 129 | 130 | with fs.open(src, "wb") as f: 131 | f.write(b"hello") 132 | 133 | fs.cp_file(src, dst) 134 | 135 | assert fs.exists(src) 136 | assert fs.exists(dst) 137 | assert fs.cat(src) == fs.cat(dst) 138 | -------------------------------------------------------------------------------- /fsspec/implementations/git.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pygit2 4 | 5 | from fsspec.spec import AbstractFileSystem 6 | 7 | from .memory import MemoryFile 8 | 9 | 10 | class GitFileSystem(AbstractFileSystem): 11 | """Browse the files of a local git repo at any hash/tag/branch 12 | 13 | (experimental backend) 14 | """ 15 | 16 | root_marker = "" 17 | cachable = True 18 | 19 | def __init__(self, path=None, fo=None, ref=None, **kwargs): 20 | """ 21 | 22 | Parameters 23 | ---------- 24 | path: str (optional) 25 | Local location of the repo (uses current directory if not given). 26 | May be deprecated in favour of ``fo``. When used with a higher 27 | level function such as fsspec.open(), may be of the form 28 | "git://[path-to-repo[:]][ref@]path/to/file" (but the actual 29 | file path should not contain "@" or ":"). 30 | fo: str (optional) 31 | Same as ``path``, but passed as part of a chained URL. This one 32 | takes precedence if both are given. 33 | ref: str (optional) 34 | Reference to work with, could be a hash, tag or branch name. Defaults 35 | to current working tree. 
Note that ``ls`` and ``open`` also take hash, 36 | so this becomes the default for those operations 37 | kwargs 38 | """ 39 | super().__init__(**kwargs) 40 | self.repo = pygit2.Repository(fo or path or os.getcwd()) 41 | self.ref = ref or "master" 42 | 43 | @classmethod 44 | def _strip_protocol(cls, path): 45 | path = super()._strip_protocol(path).lstrip("/") 46 | if ":" in path: 47 | path = path.split(":", 1)[1] 48 | if "@" in path: 49 | path = path.split("@", 1)[1] 50 | return path.lstrip("/") 51 | 52 | def _path_to_object(self, path, ref): 53 | comm, ref = self.repo.resolve_refish(ref or self.ref) 54 | parts = path.split("/") 55 | tree = comm.tree 56 | for part in parts: 57 | if part and isinstance(tree, pygit2.Tree): 58 | tree = tree[part] 59 | return tree 60 | 61 | @staticmethod 62 | def _get_kwargs_from_urls(path): 63 | if path.startswith("git://"): 64 | path = path[6:] 65 | out = {} 66 | if ":" in path: 67 | out["path"], path = path.split(":", 1) 68 | if "@" in path: 69 | out["ref"], path = path.split("@", 1) 70 | return out 71 | 72 | def ls(self, path, detail=True, ref=None, **kwargs): 73 | path = self._strip_protocol(path) 74 | tree = self._path_to_object(path, ref) 75 | if isinstance(tree, pygit2.Tree): 76 | out = [] 77 | for obj in tree: 78 | if isinstance(obj, pygit2.Tree): 79 | out.append( 80 | { 81 | "type": "directory", 82 | "name": "/".join([path, obj.name]).lstrip("/"), 83 | "hex": obj.hex, 84 | "mode": "%o" % obj.filemode, 85 | "size": 0, 86 | } 87 | ) 88 | else: 89 | out.append( 90 | { 91 | "type": "file", 92 | "name": "/".join([path, obj.name]).lstrip("/"), 93 | "hex": obj.hex, 94 | "mode": "%o" % obj.filemode, 95 | "size": obj.size, 96 | } 97 | ) 98 | else: 99 | obj = tree 100 | out = [ 101 | { 102 | "type": "file", 103 | "name": obj.name, 104 | "hex": obj.hex, 105 | "mode": "%o" % obj.filemode, 106 | "size": obj.size, 107 | } 108 | ] 109 | if detail: 110 | return out 111 | return [o["name"] for o in out] 112 | 113 | def ukey(self, path, ref=None): 114 | return self.info(path, ref=ref)["hex"] 115 | 116 | def _open( 117 | self, 118 | path, 119 | mode="rb", 120 | block_size=None, 121 | autocommit=True, 122 | cache_options=None, 123 | ref=None, 124 | **kwargs, 125 | ): 126 | obj = self._path_to_object(path, ref or self.ref) 127 | return MemoryFile(data=obj.data) 128 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_memory.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | 6 | def test_1(m): 7 | m.touch("/somefile") # NB: is found with or without initial / 8 | m.touch("afiles/and/another") 9 | files = m.find("") 10 | assert files == ["/afiles/and/another", "/somefile"] 11 | 12 | files = sorted(m.get_mapper()) 13 | assert files == ["afiles/and/another", "somefile"] 14 | 15 | 16 | def test_strip(m): 17 | assert m._strip_protocol("") == "" 18 | assert m._strip_protocol("memory://") == "" 19 | assert m._strip_protocol("afile") == "/afile" 20 | assert m._strip_protocol("/b/c") == "/b/c" 21 | assert m._strip_protocol("/b/c/") == "/b/c" 22 | 23 | 24 | def test_put_single(m, tmpdir): 25 | fn = os.path.join(str(tmpdir), "dir") 26 | os.mkdir(fn) 27 | open(os.path.join(fn, "abc"), "w").write("text") 28 | m.put(fn, "/test") # no-op, no files 29 | assert not m.exists("/test/abc") 30 | assert not m.exists("/test/dir") 31 | m.put(fn + "/", "/test", recursive=True) 32 | assert m.cat("/test/dir/abc") == b"text" 33 | 34 | 35 | def test_ls(m): 36 | 
m.mkdir("/dir") 37 | m.mkdir("/dir/dir1") 38 | 39 | m.touch("/dir/afile") 40 | m.touch("/dir/dir1/bfile") 41 | m.touch("/dir/dir1/cfile") 42 | 43 | assert m.ls("/", False) == ["/dir"] 44 | assert m.ls("/dir", False) == ["/dir/afile", "/dir/dir1"] 45 | assert m.ls("/dir", True)[0]["type"] == "file" 46 | assert m.ls("/dir", True)[1]["type"] == "directory" 47 | 48 | assert len(m.ls("/dir/dir1")) == 2 49 | 50 | 51 | def test_directories(m): 52 | m.mkdir("outer/inner") 53 | assert m.info("outer/inner")["type"] == "directory" 54 | 55 | assert m.ls("outer") 56 | assert m.ls("outer/inner") == [] 57 | 58 | with pytest.raises(OSError): 59 | m.rmdir("outer") 60 | 61 | m.rmdir("outer/inner") 62 | m.rmdir("outer") 63 | 64 | assert not m.store 65 | 66 | 67 | def test_mv_recursive(m): 68 | m.mkdir("src") 69 | m.touch("src/file.txt") 70 | m.mv("src", "dest", recursive=True) 71 | assert m.exists("dest/file.txt") 72 | assert not m.exists("src") 73 | 74 | 75 | def test_rm_no_psuedo_dir(m): 76 | m.touch("/dir1/dir2/file") 77 | m.rm("/dir1", recursive=True) 78 | assert not m.exists("/dir1/dir2/file") 79 | assert not m.exists("/dir1/dir2") 80 | assert not m.exists("/dir1") 81 | 82 | with pytest.raises(FileNotFoundError): 83 | m.rm("/dir1", recursive=True) 84 | 85 | 86 | def test_rewind(m): 87 | # https://github.com/fsspec/filesystem_spec/issues/349 88 | with m.open("src/file.txt", "w") as f: 89 | f.write("content") 90 | with m.open("src/file.txt") as f: 91 | assert f.tell() == 0 92 | 93 | 94 | def test_empty_raises(m): 95 | with pytest.raises(FileNotFoundError): 96 | m.ls("nonexistent") 97 | 98 | with pytest.raises(FileNotFoundError): 99 | m.info("nonexistent") 100 | 101 | 102 | def test_dir_errors(m): 103 | m.mkdir("/first") 104 | 105 | with pytest.raises(FileExistsError): 106 | m.mkdir("/first") 107 | with pytest.raises(FileExistsError): 108 | m.makedirs("/first", exist_ok=False) 109 | m.makedirs("/first", exist_ok=True) 110 | m.makedirs("/first/second/third") 111 | assert "/first/second" in m.pseudo_dirs 112 | 113 | m.touch("/afile") 114 | with pytest.raises(NotADirectoryError): 115 | m.mkdir("/afile/nodir") 116 | 117 | 118 | def test_no_rewind_append_mode(m): 119 | # https://github.com/fsspec/filesystem_spec/issues/349 120 | with m.open("src/file.txt", "w") as f: 121 | f.write("content") 122 | with m.open("src/file.txt", "a") as f: 123 | assert f.tell() == 7 124 | 125 | 126 | def test_moves(m): 127 | m.touch("source.txt") 128 | m.mv("source.txt", "target.txt") 129 | 130 | m.touch("source2.txt") 131 | m.mv("source2.txt", "target2.txt", recursive=True) 132 | assert m.find("") == ["/target.txt", "/target2.txt"] 133 | 134 | 135 | def test_rm_reursive_empty_subdir(m): 136 | # https://github.com/fsspec/filesystem_spec/issues/500 137 | m.mkdir("recdir") 138 | m.mkdir("recdir/subdir2") 139 | m.rm("recdir/", recursive=True) 140 | assert not m.exists("dir") 141 | 142 | 143 | def test_seekable(m): 144 | fn0 = "foo.txt" 145 | with m.open(fn0, "wb") as f: 146 | f.write(b"data") 147 | 148 | f = m.open(fn0, "rt") 149 | assert f.seekable(), "file is not seekable" 150 | f.seek(1) 151 | assert f.read(1) == "a" 152 | assert f.tell() == 2 153 | 154 | 155 | def test_remove_all(m): 156 | m.touch("afile") 157 | m.rm("/", recursive=True) 158 | assert not m.ls("/") 159 | -------------------------------------------------------------------------------- /docs/source/usage.rst: -------------------------------------------------------------------------------- 1 | Usage 2 | ===== 3 | 4 | This is quick-start documentation to help people 
get familiar with the layout and functioning of ``fsspec``. 5 | 6 | Instantiate a file-system 7 | ------------------------- 8 | 9 | ``fsspec`` provides an abstract file-system interface as a base class, to be used by other filesystems. 10 | A file-system instance is an object for manipulating files on some 11 | remote store, local files, files within some wrapper, or anything else that is capable of producing 12 | file-like objects. 13 | 14 | Some concrete implementations are bundled with ``fsspec`` and others can be installed separately. They 15 | can be instantiated directly, or the `registry` can be used to find them. 16 | 17 | Direct instantiation: 18 | 19 | .. code-block:: python 20 | 21 | from fsspec.implementations.local import LocalFileSystem 22 | 23 | fs = LocalFileSystem() 24 | 25 | Look-up via registry: 26 | 27 | .. code-block:: python 28 | 29 | import fsspec 30 | 31 | fs = fsspec.filesystem('file') 32 | 33 | Many filesystems also take extra parameters, some of which may be optional - see :doc:`api`, or use 34 | :func:`fsspec.get_filesystem_class` to get the class object and inspect its docstring. 35 | 36 | .. code-block:: python 37 | 38 | import fsspec 39 | 40 | fs = fsspec.filesystem('ftp', host=host, port=port, username=user, password=pw) 41 | 42 | The list of implemented ``fsspec`` protocols can be retrieved using :func:`fsspec.available_protocols`. 43 | 44 | Use a file-system 45 | ----------------- 46 | 47 | File-system instances offer a large number of methods for getting information about and manipulating files 48 | for the given back-end. Although some specific implementations may not offer all features (e.g., ``http`` 49 | is read-only), generally all normal operations, such as ``ls``, ``rm``, should be expected to work (see the 50 | full list: :class:`fsspec.spec.AbstractFileSystem`). 51 | Note that this quick-start will prefer posix-style naming, but 52 | many common operations are aliased: ``cp()`` and ``copy()`` are identical, for instance. 53 | Functionality is generally chosen to be as close to the builtin ``os`` module's working for things like 54 | ``glob`` as possible. The following block of operations should seem very familiar. 55 | 56 | .. code-block:: python 57 | 58 | fs.mkdir("/remote/output") 59 | fs.touch("/remote/output/success") # creates empty file 60 | assert fs.exists("/remote/output/success") 61 | assert fs.isfile("/remote/output/success") 62 | assert fs.cat("/remote/output/success") == b"" # get content as bytestring 63 | fs.copy("/remote/output/success", "/remote/output/copy") 64 | assert fs.ls("/remote/output", detail=False) == ["/remote/output/success", "/remote/output/copy"] 65 | fs.rm("/remote/output", recursive=True) 66 | 67 | The ``open()`` method will return a file-like object which can be passed to any other library that expects 68 | to work with python files, or used by your own code as you would a normal python file object. 69 | These will normally be binary-mode only, but may implement internal buffering 70 | in order to limit the number of reads from a remote source. They respect the use of ``with`` contexts. If 71 | you have ``pandas`` installed, for example, you can do the following: 72 | 73 | .. 
code-block:: python 74 | 75 | f = fs.open("/remote/path/notes.txt", "rb") 76 | line = f.readline() # read to first b"\n" 77 | f.seek(-10, 2) 78 | foot = f.read() # read last 10 bytes of file 79 | f.close() 80 | 81 | import pandas as pd 82 | with fs.open('/remote/data/myfile.csv') as f: 83 | df = pd.read_csv(f, sep='|', header=None) 84 | 85 | Higher-level 86 | ------------ 87 | 88 | For many situations, the only function that will be needed is :func:`fsspec.open_files()`, which will return 89 | :class:`fsspec.core.OpenFile` instances created from a single URL and parameters to pass to the backend(s). 90 | This supports text-mode and compression on the fly, and the objects can be serialized for passing between 91 | processes or machines (so long as each has access to the same backend file-system). The protocol (i.e., 92 | backend) is inferred from the URL passed, and glob characters are expanded in read mode (search for files) 93 | or write mode (create names). Critically, the file on the backend system is not actually opened until the 94 | ``OpenFile`` instance is used in a ``with`` context. 95 | 96 | .. code-block:: python 97 | 98 | of = fsspec.open("github://dask:fastparquet@main/test-data/nation.csv", "rt") 99 | # of is an OpenFile container object. The "with" context below actually opens it 100 | with of as f: 101 | # now f is a text-mode file 102 | for line in f: 103 | # iterate text lines 104 | print(line) 105 | if "KENYA" in line: 106 | break 107 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_dbfs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test cases for the Databricks filesystem. 3 | This test case is somewhat special, as there is no "mock" databricks 4 | API available. We use the "vcr" package to record the requests and 5 | responses to the real databricks API and replay them on tests. 6 | 7 | This, however, means that when you change the tests (or when the API 8 | itself changes, which is very unlikely to occur as it is versioned), 9 | you need to re-record the answers. This can be done as follows: 10 | 11 | 1. Delete all cassette files in the "./cassettes" folder 12 | 2. Spin up a databricks cluster. For example, 13 | you can use an Azure Databricks instance for this. 14 | 3. Take note of the instance details (the instance URL. For example for an Azure 15 | databricks cluster, this has the form 16 | adb-..azuredatabricks.net) 17 | and your personal token (Find out more here: 18 | https://docs.databricks.com/dev-tools/api/latest/authentication.html) 19 | 4. Set the two environment variables `DBFS_INSTANCE` and `DBFS_TOKEN` 20 | 5. Now execute the tests as normal. The results of the API calls will be recorded. 21 | 6. Unset the environment variables and replay the tests. 22 | """ 23 | import os 24 | from urllib.parse import urlparse 25 | 26 | import pytest 27 | 28 | import fsspec 29 | 30 | DUMMY_INSTANCE = "my_instance.com" 31 | INSTANCE = os.getenv("DBFS_INSTANCE", DUMMY_INSTANCE) 32 | TOKEN = os.getenv("DBFS_TOKEN", "") 33 | 34 | 35 | @pytest.fixture(scope="module") 36 | def vcr_config(): 37 | """ 38 | So as not to record the instance and token details 39 | (which are sensitive), we delete them from both the 40 | request and the response before storing it. 41 | We also delete the date as it is likely to change 42 | (and will make git diffs harder). 43 | If the DBFS_TOKEN env variable is set, we record with VCR.
44 | If not, we only replay (to avoid accidentally recording with a wrong URL). 45 | """ 46 | 47 | def before_record_response(response): 48 | try: 49 | del response["headers"]["x-databricks-org-id"] 50 | del response["headers"]["date"] 51 | except KeyError: 52 | pass 53 | return response 54 | 55 | def before_record_request(request): 56 | # Replace the instance URL 57 | uri = urlparse(request.uri) 58 | uri = uri._replace(netloc=DUMMY_INSTANCE) 59 | request.uri = uri.geturl() 60 | 61 | return request 62 | 63 | if TOKEN: 64 | return { 65 | "record_mode": "once", 66 | "filter_headers": [("authorization", "DUMMY")], 67 | "before_record_response": before_record_response, 68 | "before_record_request": before_record_request, 69 | } 70 | else: 71 | return { 72 | "record_mode": "none", 73 | } 74 | 75 | 76 | @pytest.fixture 77 | def dbfsFS(): 78 | fs = fsspec.filesystem( 79 | "dbfs", 80 | instance=INSTANCE, 81 | token=TOKEN, 82 | ) 83 | 84 | return fs 85 | 86 | 87 | @pytest.mark.vcr() 88 | def test_dbfs_file_listing(dbfsFS): 89 | assert "/FileStore" in dbfsFS.ls("/", detail=False) 90 | assert {"name": "/FileStore", "size": 0, "type": "directory"} in dbfsFS.ls( 91 | "/", detail=True 92 | ) 93 | 94 | 95 | @pytest.mark.vcr() 96 | def test_dbfs_mkdir(dbfsFS): 97 | dbfsFS.rm("/FileStore/my", recursive=True) 98 | assert "/FileStore/my" not in dbfsFS.ls("/FileStore/", detail=False) 99 | 100 | dbfsFS.mkdir("/FileStore/my/dir", create_parents=True) 101 | 102 | assert "/FileStore/my" in dbfsFS.ls("/FileStore/", detail=False) 103 | assert "/FileStore/my/dir" in dbfsFS.ls("/FileStore/my/", detail=False) 104 | 105 | with pytest.raises(FileExistsError): 106 | dbfsFS.mkdir("/FileStore/my/dir", create_parents=True, exist_ok=False) 107 | 108 | with pytest.raises(OSError): 109 | dbfsFS.rm("/FileStore/my", recursive=False) 110 | 111 | assert "/FileStore/my" in dbfsFS.ls("/FileStore/", detail=False) 112 | 113 | dbfsFS.rm("/FileStore/my", recursive=True) 114 | assert "/FileStore/my" not in dbfsFS.ls("/FileStore/", detail=False) 115 | 116 | 117 | @pytest.mark.vcr() 118 | def test_dbfs_write_and_read(dbfsFS): 119 | dbfsFS.rm("/FileStore/file.csv") 120 | assert "/FileStore/file.csv" not in dbfsFS.ls("/FileStore/", detail=False) 121 | 122 | content = b"This is a test\n" * 100000 + b"For this is the end\n" 123 | 124 | with dbfsFS.open("/FileStore/file.csv", "wb") as f: 125 | f.write(content) 126 | 127 | assert "/FileStore/file.csv" in dbfsFS.ls("/FileStore", detail=False) 128 | 129 | with dbfsFS.open("/FileStore/file.csv", "rb") as f: 130 | data = f.read() 131 | assert data == content 132 | 133 | dbfsFS.rm("/FileStore/file.csv") 134 | assert "/FileStore/file.csv" not in dbfsFS.ls("/FileStore/", detail=False) 135 | -------------------------------------------------------------------------------- /fsspec/tests/test_async.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | import os 4 | import time 5 | 6 | import pytest 7 | 8 | import fsspec 9 | import fsspec.asyn 10 | from fsspec.asyn import _run_coros_in_chunks, get_running_loop 11 | 12 | 13 | def test_sync_methods(): 14 | inst = fsspec.asyn.AsyncFileSystem() 15 | assert inspect.iscoroutinefunction(inst._info) 16 | assert hasattr(inst, "info") 17 | assert not inspect.iscoroutinefunction(inst.info) 18 | 19 | 20 | def test_interrupt(): 21 | loop = fsspec.asyn.get_loop() 22 | 23 | async def f(): 24 | await asyncio.sleep(1000000) 25 | return True 26 | 27 | fut = asyncio.run_coroutine_threadsafe(f(), loop) 28 | 
time.sleep(0.01) # task launches 29 | out = fsspec.asyn._dump_running_tasks(with_task=True) 30 | task = out[0]["task"] 31 | assert task.done() and fut.done() 32 | assert isinstance(fut.exception(), fsspec.asyn.FSSpecCoroutineCancel) 33 | 34 | 35 | class _DummyAsyncKlass: 36 | def __init__(self): 37 | self.loop = fsspec.asyn.get_loop() 38 | 39 | async def _dummy_async_func(self): 40 | # Sleep 1 second function to test timeout 41 | await asyncio.sleep(1) 42 | return True 43 | 44 | dummy_func = fsspec.asyn.sync_wrapper(_dummy_async_func) 45 | 46 | 47 | def test_sync_wrapper_timeout_on_less_than_expected_wait_time_not_finish_function(): 48 | test_obj = _DummyAsyncKlass() 49 | with pytest.raises(fsspec.FSTimeoutError): 50 | test_obj.dummy_func(timeout=0.1) 51 | 52 | 53 | def test_sync_wrapper_timeout_on_more_than_expected_wait_time_will_finish_function(): 54 | test_obj = _DummyAsyncKlass() 55 | assert test_obj.dummy_func(timeout=5) 56 | 57 | 58 | def test_sync_wrapper_timeout_none_will_wait_func_finished(): 59 | test_obj = _DummyAsyncKlass() 60 | assert test_obj.dummy_func(timeout=None) 61 | 62 | 63 | def test_sync_wrapper_treat_timeout_0_as_none(): 64 | test_obj = _DummyAsyncKlass() 65 | assert test_obj.dummy_func(timeout=0) 66 | 67 | 68 | def test_run_coros_in_chunks(monkeypatch): 69 | total_running = 0 70 | 71 | async def runner(): 72 | nonlocal total_running 73 | 74 | total_running += 1 75 | await asyncio.sleep(0) 76 | if total_running > 4: 77 | raise ValueError("More than 4 coroutines are running together") 78 | total_running -= 1 79 | return 1 80 | 81 | async def main(**kwargs): 82 | nonlocal total_running 83 | 84 | total_running = 0 85 | coros = [runner() for _ in range(32)] 86 | results = await _run_coros_in_chunks(coros, **kwargs) 87 | for result in results: 88 | if isinstance(result, Exception): 89 | raise result 90 | return results 91 | 92 | assert sum(asyncio.run(main(batch_size=4))) == 32 93 | 94 | with pytest.raises(ValueError): 95 | asyncio.run(main(batch_size=5)) 96 | 97 | with pytest.raises(ValueError): 98 | asyncio.run(main(batch_size=-1)) 99 | 100 | assert sum(asyncio.run(main(batch_size=4))) == 32 101 | 102 | monkeypatch.setitem(fsspec.config.conf, "gather_batch_size", 5) 103 | with pytest.raises(ValueError): 104 | asyncio.run(main()) 105 | assert sum(asyncio.run(main(batch_size=4))) == 32 # override 106 | 107 | monkeypatch.setitem(fsspec.config.conf, "gather_batch_size", 4) 108 | assert sum(asyncio.run(main())) == 32 # override 109 | 110 | 111 | @pytest.mark.skipif(os.name != "nt", reason="only for windows") 112 | def test_windows_policy(): 113 | from asyncio.windows_events import SelectorEventLoop 114 | 115 | loop = fsspec.asyn.get_loop() 116 | policy = asyncio.get_event_loop_policy() 117 | 118 | # Ensure that the created loop always uses selector policy 119 | assert isinstance(loop, SelectorEventLoop) 120 | 121 | # Ensure that the global policy is not changed and it is 122 | # set to the default one. This is important since the 123 | # get_loop() method will temporarily override the policy 124 | # with the one which uses selectors on windows, so this 125 | # check ensures that we are restoring the old policy back 126 | # after our change. 
127 |         assert isinstance(policy, asyncio.DefaultEventLoopPolicy)
128 | 
129 | 
130 | def test_fsspec_loop():
131 |     asyncio._set_running_loop(None)
132 | 
133 |     with fsspec.asyn.fsspec_loop() as loop:
134 |         assert get_running_loop() is loop
135 |         assert get_running_loop() is fsspec.asyn.get_loop()
136 | 
137 |     with pytest.raises(RuntimeError):
138 |         get_running_loop()
139 | 
140 |     original_loop = asyncio.new_event_loop()
141 |     asyncio._set_running_loop(original_loop)
142 | 
143 |     with fsspec.asyn.fsspec_loop() as loop:
144 |         assert get_running_loop() is loop
145 |         assert get_running_loop() is fsspec.asyn.get_loop()
146 | 
147 |     assert get_running_loop() is original_loop
148 | 
--------------------------------------------------------------------------------
/docs/source/developer.rst:
--------------------------------------------------------------------------------
 1 | Developing with fsspec
 2 | ----------------------
 3 | 
 4 | Whereas the majority of the documentation describes the use of ``fsspec``
 5 | from the end-user's point of view, ``fsspec`` is used by many libraries
 6 | as the primary/only interface to file operations.
 7 | 
 8 | Clients of the library
 9 | ~~~~~~~~~~~~~~~~~~~~~~
10 | 
11 | The most common entry point for libraries which wish to rely on ``fsspec``
12 | will be ``open`` or ``open_files``, as a way of generating an object compatible
13 | with the python file interface. This actually produces an ``OpenFile`` instance,
14 | which can be serialised across a network, and resources are only engaged when
15 | entering a context, e.g.
16 | 
17 | .. code-block:: python
18 | 
19 |     with fsspec.open("protocol://path", 'rb', param=value) as f:
20 |         process_file(f)
21 | 
22 | Note the backend-specific parameters that can be passed in this call.
23 | 
24 | In cases where the caller wants to control the context directly, they can use the
25 | ``open`` method of the ``OpenFile``, or get the filesystem object directly,
26 | skipping the ``OpenFile`` route. In the latter case, text encoding and compression
27 | are **not** handled for you. The file-like object can also be used as a context
28 | manager, or the ``close()`` method must be called explicitly to release resources.
29 | 
30 | .. code-block:: python
31 | 
32 |     # OpenFile route
33 |     of = fsspec.open("protocol://path", 'rb', param=value)
34 |     f = of.open()
35 |     process_file(f)
36 |     f.close()
37 | 
38 |     # filesystem class route, context
39 |     fs = fsspec.filesystem("protocol", param=value)
40 |     with fs.open("path", "rb") as f:
41 |         process_file(f)
42 | 
43 |     # filesystem class route, explicit close
44 |     fs = fsspec.filesystem("protocol", param=value)
45 |     f = fs.open("path", "rb")
46 |     process_file(f)
47 |     f.close()
48 | 
49 | Implementing a backend
50 | ~~~~~~~~~~~~~~~~~~~~~~
51 | 
52 | The class ``AbstractFileSystem`` provides a template of the methods
53 | that a potential implementation should supply, as well as default
54 | implementations of functionality that depends on these. Methods that
55 | *could* be implemented are marked with ``NotImplementedError`` or
56 | ``pass`` (the latter specifically for directory operations that might
57 | not be required for some backends where directories are emulated).
58 | 
59 | Note that not all of the methods need to be implemented: for example,
60 | some implementations may be read-only, in which case things like ``pipe``,
61 | ``put``, ``touch``, ``rm``, etc., can be left as not-implemented
62 | (or you might implement them to raise ``PermissionError``, ``OSError`` 30, or some
63 | other read-only exception).
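For illustration only, here is a minimal sketch of such a read-only backend; the
``DictReadOnlyFileSystem`` class, its ``dictro`` protocol, and the flat ``data``
mapping are invented for this example and are not part of ``fsspec``:

.. code-block:: python

    import io

    from fsspec.spec import AbstractFileSystem


    class DictReadOnlyFileSystem(AbstractFileSystem):
        # serves a fixed, flat {path: bytes} mapping, read-only
        protocol = "dictro"

        def __init__(self, data, **kwargs):
            super().__init__(**kwargs)
            self.data = data

        def ls(self, path, detail=True, **kwargs):
            # flat namespace: every key is a file at the root
            out = [
                {"name": name, "size": len(blob), "type": "file"}
                for name, blob in self.data.items()
            ]
            return out if detail else [o["name"] for o in out]

        def _open(self, path, mode="rb", **kwargs):
            # rejecting any write mode is what makes the backend read-only
            if "r" not in mode:
                raise PermissionError("this filesystem is read-only")
            return io.BytesIO(self.data[self._strip_protocol(path)])

With this sketch, ``DictReadOnlyFileSystem({"afile": b"hello"}).cat("afile")``
returns ``b"hello"``, while any attempt to open a path for writing raises
``PermissionError``.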
 64 | 
 65 | We may eventually refactor ``AbstractFileSystem`` to split the default implementation,
 66 | the set of methods that you might implement in a new backend, and the
 67 | documented end-user API.
 68 | 
 69 | To register a new backend with fsspec, it should declare itself
 70 | using the ``entry_points``
 71 | facility from setuptools. In particular, if you want to register a new
 72 | filesystem protocol ``myfs`` which is provided by the ``MyFS`` class in
 73 | the ``myfs`` package, add the following to your ``setup.py``:
 74 | 
 75 | .. code-block:: python
 76 | 
 77 |     setuptools.setup(
 78 |         ...
 79 |         entry_points={
 80 |             'fsspec.specs': [
 81 |                 'myfs=myfs.MyFS',
 82 |             ],
 83 |         },
 84 |         ...
 85 |     )
 86 | 
 87 | 
 88 | Alternatively, the previous method of registering a new backend can be used.
 89 | That is, new backends can register themselves on import
 90 | (``register_implementation``) or post a PR to the ``fsspec`` repo
 91 | asking to be included in ``fsspec.registry.known_implementations``.
 92 | 
 93 | Implementing async
 94 | ~~~~~~~~~~~~~~~~~~
 95 | 
 96 | Starting in version 0.7.5, we provide async operations for some methods
 97 | of some implementations. Async support in storage implementations is
 98 | optional. Special considerations are required for async
 99 | development, see :doc:`async`.
100 | 
101 | Developing the library
102 | ~~~~~~~~~~~~~~~~~~~~~~
103 | 
104 | The following can be used to install ``fsspec`` in development mode:
105 | 
106 | .. code-block::
107 | 
108 |     git clone https://github.com/fsspec/filesystem_spec
109 |     cd filesystem_spec
110 |     pip install -e .
111 | 
112 | A number of additional dependencies are required to run tests (see ``ci/environment*.yml``), as
113 | well as Docker. Most implementation-specific tests should skip if their requirements are
114 | not met.
115 | 
116 | Development happens by submitting pull requests (PRs) on github.
117 | This repo adheres to the flake8 and black coding conventions. You may wish to install
118 | commit hooks if you intend to make PRs, as linting is done as part of the CI (see the ``pre-commit`` sketch below).
119 | 
120 | Docs use sphinx and the numpy docstring style. Please add an entry to the changelog
121 | along with any PR.
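For instance, the commit hooks mentioned above can be set up with the standard
``pre-commit`` tool, which reads the ``.pre-commit-config.yaml`` at the repository
root (a sketch of the usual workflow, not an official recipe):

.. code-block::

    pip install pre-commit
    pre-commit install          # lint automatically on each commit
    pre-commit run --all-files  # or check the whole tree once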
122 | -------------------------------------------------------------------------------- /fsspec/tests/test_mapping.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sys 4 | 5 | import pytest 6 | 7 | import fsspec 8 | from fsspec.implementations.local import LocalFileSystem 9 | from fsspec.implementations.memory import MemoryFileSystem 10 | 11 | 12 | def test_mapping_prefix(tmpdir): 13 | tmpdir = str(tmpdir) 14 | os.makedirs(os.path.join(tmpdir, "afolder")) 15 | open(os.path.join(tmpdir, "afile"), "w").write("test") 16 | open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") 17 | 18 | m = fsspec.get_mapper("file://" + tmpdir) 19 | assert "afile" in m 20 | assert m["afolder/anotherfile"] == b"test2" 21 | 22 | fs = fsspec.filesystem("file") 23 | m2 = fs.get_mapper(tmpdir) 24 | m3 = fs.get_mapper("file://" + tmpdir) 25 | 26 | assert m == m2 == m3 27 | 28 | 29 | def test_getitems_errors(tmpdir): 30 | tmpdir = str(tmpdir) 31 | os.makedirs(os.path.join(tmpdir, "afolder")) 32 | open(os.path.join(tmpdir, "afile"), "w").write("test") 33 | open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") 34 | m = fsspec.get_mapper("file://" + tmpdir) 35 | assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} 36 | with pytest.raises(KeyError): 37 | m.getitems(["afile", "bfile"]) 38 | out = m.getitems(["afile", "bfile"], on_error="return") 39 | assert isinstance(out["bfile"], KeyError) 40 | m = fsspec.get_mapper("file://" + tmpdir, missing_exceptions=()) 41 | assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} 42 | with pytest.raises(FileNotFoundError): 43 | m.getitems(["afile", "bfile"]) 44 | 45 | 46 | def test_ops(): 47 | MemoryFileSystem.store.clear() 48 | m = fsspec.get_mapper("memory://") 49 | assert not m 50 | assert list(m) == [] 51 | 52 | with pytest.raises(KeyError): 53 | m["hi"] 54 | 55 | assert m.pop("key", 0) == 0 56 | 57 | m["key0"] = b"data" 58 | assert list(m) == ["key0"] 59 | assert m["key0"] == b"data" 60 | 61 | m.clear() 62 | 63 | assert list(m) == [] 64 | 65 | 66 | def test_pickle(): 67 | m = fsspec.get_mapper("memory://") 68 | assert isinstance(m.fs, MemoryFileSystem) 69 | m["key"] = b"data" 70 | m2 = pickle.loads(pickle.dumps(m)) 71 | assert list(m) == list(m2) 72 | assert m.missing_exceptions == m2.missing_exceptions 73 | 74 | 75 | def test_keys_view(): 76 | # https://github.com/fsspec/filesystem_spec/issues/186 77 | m = fsspec.get_mapper("memory://") 78 | m["key"] = b"data" 79 | 80 | keys = m.keys() 81 | assert len(keys) == 1 82 | # check that we don't consume the keys 83 | assert len(keys) == 1 84 | m.clear() 85 | 86 | 87 | def test_multi(): 88 | m = fsspec.get_mapper("memory:///") 89 | data = {"a": b"data1", "b": b"data2"} 90 | m.setitems(data) 91 | 92 | assert m.getitems(list(data)) == data 93 | m.delitems(list(data)) 94 | assert not list(m) 95 | 96 | 97 | def test_setitem_types(): 98 | import array 99 | 100 | m = fsspec.get_mapper("memory://") 101 | m["a"] = array.array("i", [1]) 102 | if sys.byteorder == "little": 103 | assert m["a"] == b"\x01\x00\x00\x00" 104 | else: 105 | assert m["a"] == b"\x00\x00\x00\x01" 106 | m["b"] = bytearray(b"123") 107 | assert m["b"] == b"123" 108 | m.setitems({"c": array.array("i", [1]), "d": bytearray(b"123")}) 109 | if sys.byteorder == "little": 110 | assert m["c"] == b"\x01\x00\x00\x00" 111 | else: 112 | assert m["c"] == b"\x00\x00\x00\x01" 113 | assert m["d"] == b"123" 114 | 115 | 116 | def 
test_setitem_numpy(): 117 | m = fsspec.get_mapper("memory://") 118 | np = pytest.importorskip("numpy") 119 | m["c"] = np.array(1, dtype="Link' % realfile.encode() 14 | listing = open( 15 | os.path.join(os.path.dirname(__file__), "data", "listing.html"), "rb" 16 | ).read() 17 | win = os.name == "nt" 18 | 19 | 20 | @pytest.fixture 21 | def reset_files(): 22 | yield 23 | 24 | # Reset the newly added files after the 25 | # test is completed. 26 | HTTPTestHandler.dynamic_files.clear() 27 | 28 | 29 | class HTTPTestHandler(BaseHTTPRequestHandler): 30 | static_files = { 31 | "/index/realfile": data, 32 | "/index/otherfile": data, 33 | "/index": index, 34 | "/data/20020401": listing, 35 | } 36 | dynamic_files = {} 37 | 38 | files = ChainMap(dynamic_files, static_files) 39 | 40 | def __init__(self, *args, **kwargs): 41 | super().__init__(*args, **kwargs) 42 | 43 | def _respond(self, code=200, headers=None, data=b""): 44 | headers = headers or {} 45 | headers.update({"User-Agent": "test"}) 46 | self.send_response(code) 47 | for k, v in headers.items(): 48 | self.send_header(k, str(v)) 49 | self.end_headers() 50 | if data: 51 | self.wfile.write(data) 52 | 53 | def do_GET(self): 54 | file_path = self.path.rstrip("/") 55 | file_data = self.files.get(file_path) 56 | if file_data is None: 57 | return self._respond(404) 58 | if "Range" in self.headers: 59 | ran = self.headers["Range"] 60 | b, ran = ran.split("=") 61 | start, end = ran.split("-") 62 | if start: 63 | file_data = file_data[int(start) : (int(end) + 1) if end else None] 64 | else: 65 | # suffix only 66 | file_data = file_data[-int(end) :] 67 | if "give_length" in self.headers: 68 | response_headers = {"Content-Length": len(file_data)} 69 | self._respond(200, response_headers, file_data) 70 | elif "give_range" in self.headers: 71 | self._respond( 72 | 200, 73 | {"Content-Range": "0-%i/%i" % (len(file_data) - 1, len(file_data))}, 74 | file_data, 75 | ) 76 | else: 77 | self._respond(200, data=file_data) 78 | 79 | def do_POST(self): 80 | length = self.headers.get("Content-Length") 81 | file_path = self.path.rstrip("/") 82 | if length is None: 83 | assert self.headers.get("Transfer-Encoding") == "chunked" 84 | self.files[file_path] = b"".join(self.read_chunks()) 85 | else: 86 | self.files[file_path] = self.rfile.read(length) 87 | self._respond(200) 88 | 89 | do_PUT = do_POST 90 | 91 | def read_chunks(self): 92 | length = -1 93 | while length != 0: 94 | line = self.rfile.readline().strip() 95 | if len(line) == 0: 96 | length = 0 97 | else: 98 | length = int(line, 16) 99 | yield self.rfile.read(length) 100 | self.rfile.readline() 101 | 102 | def do_HEAD(self): 103 | if "head_not_auth" in self.headers: 104 | return self._respond( 105 | 403, {"Content-Length": 123}, b"not authorized for HEAD request" 106 | ) 107 | elif "head_ok" not in self.headers: 108 | return self._respond(405) 109 | 110 | file_path = self.path.rstrip("/") 111 | file_data = self.files.get(file_path) 112 | if file_data is None: 113 | return self._respond(404) 114 | 115 | if "give_length" in self.headers: 116 | response_headers = {"Content-Length": len(file_data)} 117 | if "zero_length" in self.headers: 118 | response_headers["Content-Length"] = 0 119 | 120 | self._respond(200, response_headers) 121 | elif "give_range" in self.headers: 122 | self._respond( 123 | 200, {"Content-Range": "0-%i/%i" % (len(file_data) - 1, len(file_data))} 124 | ) 125 | elif "give_etag" in self.headers: 126 | self._respond(200, {"ETag": "xxx"}) 127 | else: 128 | self._respond(200) # OK response, but no 
useful info
129 | 
130 | 
131 | @contextlib.contextmanager
132 | def serve():
133 |     server_address = ("", port)
134 |     httpd = HTTPServer(server_address, HTTPTestHandler)
135 |     th = threading.Thread(target=httpd.serve_forever)
136 |     th.daemon = True
137 |     th.start()
138 |     try:
139 |         yield "http://localhost:%i" % port
140 |     finally:
141 |         httpd.socket.close()
142 |         httpd.shutdown()
143 |         th.join()
144 | 
145 | 
146 | @pytest.fixture(scope="module")
147 | def server():
148 |     with serve() as s:
149 |         yield s
150 | 
--------------------------------------------------------------------------------
/docs/source/intro.rst:
--------------------------------------------------------------------------------
 1 | Background
 2 | ==========
 3 | 
 4 | Python provides a standard interface for open files, so that alternate implementations of file-like objects can
 5 | work seamlessly with many functions which rely only on the methods of that standard interface. A number of libraries
 6 | have implemented a similar concept for file-systems, where file operations can be performed on a logical file-system
 7 | which may be local, a structured data store, or some remote service.
 8 | 
 9 | This repository is intended to be a place to define a standard interface that such file-systems should adhere to,
10 | such that code using them should not have to know the details of the implementation in order to operate on any of
11 | a number of backends. It is hoped that the community can come together to
12 | define an interface that is best for the largest number of users, and that having the specification makes developing
13 | other file-system implementations simpler.
14 | 
15 | History
16 | -------
17 | 
18 | We have been involved in building a number of remote-data file-system implementations, principally
19 | in the context of the `Dask`_ project. In particular, several are listed
20 | in `docs`_ with links to the specific repositories.
21 | With common authorship, there is much that is similar between the implementations, for example posix-like naming
22 | of the operations, and this has allowed Dask to be able to interact with the various backends and parse generic
23 | URLs in order to select amongst them. However, *some* extra code was required in each case to adapt the peculiarities
24 | of each implementation to the generic usage that Dask demanded. People may find the
25 | `code`_ which parses URLs and creates file-system
26 | instances interesting.
27 | 
28 | .. _Dask: http://dask.pydata.org/en/latest/
29 | .. _docs: http://dask.pydata.org/en/latest/remote-data-services.html
30 | .. _code: https://github.com/dask/dask/blob/master/dask/bytes/core.py#L266
31 | 
32 | At the same time, the Apache `Arrow`_ project was also concerned with a similar problem,
33 | particularly a common interface to local and HDFS files, for example the
34 | `hdfs`_ interface (which actually communicated with HDFS
35 | with a choice of drivers). These are mostly used internally within Arrow, but Dask was modified in order to be able
36 | to use the alternate HDFS interface (which solves some security issues with `hdfs3`). In the process, a
37 | `conversation`_
38 | was started, and I invite all interested parties to continue the conversation in this location.
39 | 
40 | .. _Arrow: https://arrow.apache.org/
41 | .. _hdfs: https://arrow.apache.org/docs/python/filesystems.html
42 | .. _conversation: https://github.com/dask/dask/issues/2880
43 | 
44 | There is a good argument that this type of code has no place in Dask, which is concerned with making graphs
45 | representing computations, and executing those graphs on a scheduler. Indeed, the file-systems are generally useful,
46 | and each has a user-base wider than just those that work via Dask.
47 | 
48 | Influences
49 | ----------
50 | 
51 | The following are places to consider when choosing how we would like the file-system specification
52 | to look:
53 | 
54 | #. python's `os`_ module and its `path` namespace; also other file-connected
55 |    functionality in the standard library
56 | #. posix/bash method naming conventions that linux/unix/osx users are familiar with; or perhaps their Windows variants
57 | #. the existing implementations for the various backends (e.g.,
58 |    `gcsfs`_ or Arrow's
59 |    `hdfs`_)
60 | #. `pyfilesystems`_, an attempt to do something similar, with a
61 |    plugin architecture. Its conception has several types of local file-system, and a lot of well-thought-out
62 |    validation code.
63 | 
64 | .. _os: https://docs.python.org/3/library/os.html
65 | .. _gcsfs: http://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem
66 | .. _pyfilesystems: https://docs.pyfilesystem.org/en/latest/index.html
67 | 
68 | Other similar work
69 | ------------------
70 | 
71 | It might have been conceivable to reuse code in ``pyfilesystems``, which has an established interface and several
72 | implementations of its own. However, it supports none of the **critical** features for
73 | cloud and parallel access, and would not be easy to
74 | coerce. Following on the success of ``s3fs`` and ``gcsfs``, and their use within Dask, it seemed best to
75 | have an interface as close to those as possible. See a
76 | `discussion`_ on the topic.
77 | 
78 | .. _discussion: https://github.com/fsspec/filesystem_spec/issues/5
79 | 
80 | Other newer technologies such as `smart_open`_ and ``pyarrow``'s newer file-system rewrite also provide some
81 | of the functionality presented here, and might suit some use cases better.
82 | 
83 | .. _smart_open: https://github.com/RaRe-Technologies/smart_open
84 | 
85 | Structure of the package
86 | ------------------------
87 | 
88 | The best place to get a feel for the contents of ``fsspec`` is by looking through the :doc:`usage` and
89 | :doc:`api` sections. In addition, the source code will be interesting for those who wish to subclass and
90 | develop new file-system implementations. ``fsspec/spec.py`` contains the main abstract file-system class
91 | to derive from, ``AbstractFileSystem``.
92 | 
93 | ..
_zarr: https://zarr.readthedocs.io 94 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_ftp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import time 5 | 6 | import pytest 7 | 8 | import fsspec 9 | from fsspec import open_files 10 | from fsspec.implementations.ftp import FTPFileSystem 11 | 12 | ftplib = pytest.importorskip("ftplib") 13 | here = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | 16 | @pytest.fixture() 17 | def ftp(): 18 | pytest.importorskip("pyftpdlib") 19 | P = subprocess.Popen( 20 | [sys.executable, "-m", "pyftpdlib", "-d", here], 21 | stderr=subprocess.STDOUT, 22 | stdout=subprocess.PIPE, 23 | ) 24 | try: 25 | time.sleep(1) 26 | yield "localhost", 2121 27 | finally: 28 | P.terminate() 29 | P.wait() 30 | 31 | 32 | def test_basic(ftp): 33 | host, port = ftp 34 | fs = FTPFileSystem(host, port) 35 | assert fs.ls("/", detail=False) == sorted(os.listdir(here)) 36 | out = fs.cat("/" + os.path.basename(__file__)) 37 | assert out == open(__file__, "rb").read() 38 | 39 | 40 | def test_not_cached(ftp): 41 | host, port = ftp 42 | fs = FTPFileSystem(host, port) 43 | fs2 = FTPFileSystem(host, port) 44 | assert fs is not fs2 45 | 46 | 47 | @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) 48 | def test_complex(ftp_writable, cache_type): 49 | from fsspec.core import BytesCache 50 | 51 | host, port, user, pw = ftp_writable 52 | files = open_files( 53 | "ftp:///ou*", 54 | host=host, 55 | port=port, 56 | username=user, 57 | password=pw, 58 | block_size=10000, 59 | cache_type=cache_type, 60 | ) 61 | assert len(files) == 1 62 | with files[0] as fo: 63 | assert fo.read(10) == b"hellohello" 64 | if isinstance(fo.cache, BytesCache): 65 | assert len(fo.cache.cache) == 10010 66 | assert fo.read(2) == b"he" 67 | assert fo.tell() == 12 68 | 69 | 70 | def test_write_small(ftp_writable): 71 | host, port, user, pw = ftp_writable 72 | fs = FTPFileSystem(host, port, user, pw) 73 | with fs.open("/out2", "wb") as f: 74 | f.write(b"oi") 75 | assert fs.cat("/out2") == b"oi" 76 | 77 | 78 | def test_with_url(ftp_writable): 79 | host, port, user, pw = ftp_writable 80 | fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "wb") 81 | with fo as f: 82 | f.write(b"hello") 83 | fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "rb") 84 | with fo as f: 85 | assert f.read() == b"hello" 86 | 87 | 88 | @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) 89 | def test_write_big(ftp_writable, cache_type): 90 | host, port, user, pw = ftp_writable 91 | fs = FTPFileSystem(host, port, user, pw, block_size=1000, cache_type=cache_type) 92 | fn = "/bigger" 93 | with fs.open(fn, "wb") as f: 94 | f.write(b"o" * 500) 95 | assert not fs.exists(fn) 96 | f.write(b"o" * 1000) 97 | fs.invalidate_cache() 98 | assert fs.exists(fn) 99 | f.write(b"o" * 200) 100 | f.flush() 101 | 102 | assert fs.info(fn)["size"] == 1700 103 | assert fs.cat(fn) == b"o" * 1700 104 | 105 | 106 | def test_transaction(ftp_writable): 107 | host, port, user, pw = ftp_writable 108 | fs = FTPFileSystem(host, port, user, pw) 109 | fs.mkdir("/tmp") 110 | fn = "/tr" 111 | with fs.transaction: 112 | with fs.open(fn, "wb") as f: 113 | f.write(b"not") 114 | assert not fs.exists(fn) 115 | assert fs.exists(fn) 116 | assert fs.cat(fn) == b"not" 117 | 118 | fs.rm(fn) 119 | assert not fs.exists(fn) 120 | 121 | 122 | def test_transaction_with_cache(ftp_writable, 
tmpdir): 123 | host, port, user, pw = ftp_writable 124 | fs = FTPFileSystem(host, port, user, pw) 125 | fs.mkdir("/tmp") 126 | fs.mkdir("/tmp/dir") 127 | assert "dir" in fs.ls("/tmp", detail=False) 128 | 129 | with fs.transaction: 130 | fs.rmdir("/tmp/dir") 131 | 132 | assert "dir" not in fs.ls("/tmp", detail=False) 133 | assert not fs.exists("/tmp/dir") 134 | 135 | 136 | def test_cat_get(ftp_writable, tmpdir): 137 | host, port, user, pw = ftp_writable 138 | fs = FTPFileSystem(host, port, user, pw, block_size=500) 139 | fs.mkdir("/tmp") 140 | data = b"hello" * 500 141 | fs.pipe("/tmp/myfile", data) 142 | assert fs.cat_file("/tmp/myfile") == data 143 | 144 | fn = os.path.join(tmpdir, "lfile") 145 | fs.get_file("/tmp/myfile", fn) 146 | assert open(fn, "rb").read() == data 147 | 148 | 149 | def test_mkdir(ftp_writable): 150 | host, port, user, pw = ftp_writable 151 | fs = FTPFileSystem(host, port, user, pw) 152 | with pytest.raises(ftplib.error_perm): 153 | fs.mkdir("/tmp/not/exist", create_parents=False) 154 | fs.mkdir("/tmp/not/exist") 155 | assert fs.exists("/tmp/not/exist") 156 | fs.makedirs("/tmp/not/exist", exist_ok=True) 157 | with pytest.raises(FileExistsError): 158 | fs.makedirs("/tmp/not/exist", exist_ok=False) 159 | fs.makedirs("/tmp/not/exist/inner/inner") 160 | assert fs.isdir("/tmp/not/exist/inner/inner") 161 | 162 | 163 | def test_rm_recursive(ftp_writable): 164 | host, port, user, pw = ftp_writable 165 | fs = FTPFileSystem(host, port, user, pw) 166 | fs.mkdir("/tmp/topdir") 167 | fs.mkdir("/tmp/topdir/underdir") 168 | fs.touch("/tmp/topdir/afile") 169 | fs.touch("/tmp/topdir/underdir/afile") 170 | 171 | with pytest.raises(ftplib.error_perm): 172 | fs.rmdir("/tmp/topdir") 173 | 174 | fs.rm("/tmp/topdir", recursive=True) 175 | assert not fs.exists("/tmp/topdir") 176 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_arrow.py: -------------------------------------------------------------------------------- 1 | import secrets 2 | 3 | import pytest 4 | 5 | pyarrow_fs = pytest.importorskip("pyarrow.fs") 6 | FileSystem = pyarrow_fs.FileSystem 7 | 8 | from fsspec.implementations.arrow import ArrowFSWrapper # noqa 9 | 10 | 11 | @pytest.fixture(scope="function") 12 | def fs(): 13 | fs, _ = FileSystem.from_uri("mock://") 14 | return ArrowFSWrapper(fs) 15 | 16 | 17 | @pytest.fixture(scope="function") 18 | def remote_dir(fs): 19 | directory = secrets.token_hex(16) 20 | fs.makedirs(directory) 21 | yield directory 22 | fs.rm(directory, recursive=True) 23 | 24 | 25 | def strip_keys(original_entry): 26 | entry = original_entry.copy() 27 | entry.pop("mtime") 28 | return entry 29 | 30 | 31 | def test_info(fs, remote_dir): 32 | fs.touch(remote_dir + "/a.txt") 33 | details = fs.info(remote_dir + "/a.txt") 34 | assert details["type"] == "file" 35 | assert details["name"] == remote_dir + "/a.txt" 36 | assert details["size"] == 0 37 | 38 | fs.mkdir(remote_dir + "/dir") 39 | details = fs.info(remote_dir + "/dir") 40 | assert details["type"] == "directory" 41 | assert details["name"] == remote_dir + "/dir" 42 | 43 | details = fs.info(remote_dir + "/dir/") 44 | assert details["name"] == remote_dir + "/dir/" 45 | 46 | 47 | def test_move(fs, remote_dir): 48 | fs.touch(remote_dir + "/a.txt") 49 | initial_info = fs.info(remote_dir + "/a.txt") 50 | 51 | fs.move(remote_dir + "/a.txt", remote_dir + "/b.txt") 52 | secondary_info = fs.info(remote_dir + "/b.txt") 53 | 54 | assert not fs.exists(remote_dir + "/a.txt") 55 | assert 
fs.exists(remote_dir + "/b.txt") 56 | 57 | initial_info.pop("name") 58 | secondary_info.pop("name") 59 | assert initial_info == secondary_info 60 | 61 | 62 | def test_move_recursive(fs, remote_dir): 63 | src = remote_dir + "/src" 64 | dest = remote_dir + "/dest" 65 | 66 | assert fs.isdir(src) is False 67 | fs.mkdir(src) 68 | assert fs.isdir(src) 69 | 70 | fs.touch(src + "/a.txt") 71 | fs.mkdir(src + "/b") 72 | fs.touch(src + "/b/c.txt") 73 | fs.move(src, dest, recursive=True) 74 | 75 | assert fs.isdir(src) is False 76 | assert not fs.exists(src) 77 | 78 | assert fs.isdir(dest) 79 | assert fs.exists(dest) 80 | assert fs.cat(dest + "/b/c.txt") == fs.cat(dest + "/a.txt") == b"" 81 | 82 | 83 | def test_copy(fs, remote_dir): 84 | fs.touch(remote_dir + "/a.txt") 85 | initial_info = fs.info(remote_dir + "/a.txt") 86 | 87 | fs.copy(remote_dir + "/a.txt", remote_dir + "/b.txt") 88 | secondary_info = fs.info(remote_dir + "/b.txt") 89 | 90 | assert fs.exists(remote_dir + "/a.txt") 91 | assert fs.exists(remote_dir + "/b.txt") 92 | 93 | initial_info.pop("name") 94 | secondary_info.pop("name") 95 | assert strip_keys(initial_info) == strip_keys(secondary_info) 96 | 97 | 98 | def test_rm(fs, remote_dir): 99 | fs.touch(remote_dir + "/a.txt") 100 | fs.rm(remote_dir + "/a.txt", recursive=True) 101 | assert not fs.exists(remote_dir + "/a.txt") 102 | 103 | fs.mkdir(remote_dir + "/dir") 104 | fs.rm(remote_dir + "/dir", recursive=True) 105 | assert not fs.exists(remote_dir + "/dir") 106 | 107 | fs.mkdir(remote_dir + "/dir") 108 | fs.touch(remote_dir + "/dir/a") 109 | fs.touch(remote_dir + "/dir/b") 110 | fs.mkdir(remote_dir + "/dir/c/") 111 | fs.touch(remote_dir + "/dir/c/a/") 112 | fs.rm(remote_dir + "/dir", recursive=True) 113 | assert not fs.exists(remote_dir + "/dir") 114 | 115 | 116 | def test_ls(fs, remote_dir): 117 | fs.mkdir(remote_dir + "dir/") 118 | files = set() 119 | for no in range(8): 120 | file = remote_dir + f"dir/test_{no}" 121 | fs.touch(file) 122 | files.add(file) 123 | 124 | assert set(fs.ls(remote_dir + "dir/")) == files 125 | 126 | dirs = fs.ls(remote_dir + "dir/", detail=True) 127 | expected = [fs.info(file) for file in files] 128 | 129 | by_name = lambda details: details["name"] 130 | dirs.sort(key=by_name) 131 | expected.sort(key=by_name) 132 | 133 | assert dirs == expected 134 | 135 | 136 | def test_mkdir(fs, remote_dir): 137 | fs.mkdir(remote_dir + "dir/") 138 | assert fs.isdir(remote_dir + "dir/") 139 | assert len(fs.ls(remote_dir + "dir/")) == 0 140 | 141 | fs.mkdir(remote_dir + "dir/sub", create_parents=False) 142 | assert fs.isdir(remote_dir + "dir/sub") 143 | 144 | 145 | def test_makedirs(fs, remote_dir): 146 | fs.makedirs(remote_dir + "dir/a/b/c/") 147 | assert fs.isdir(remote_dir + "dir/a/b/c/") 148 | assert fs.isdir(remote_dir + "dir/a/b/") 149 | assert fs.isdir(remote_dir + "dir/a/") 150 | 151 | fs.makedirs(remote_dir + "dir/a/b/c/", exist_ok=True) 152 | 153 | 154 | def test_exceptions(fs, remote_dir): 155 | with pytest.raises(FileNotFoundError): 156 | with fs.open(remote_dir + "/a.txt"): 157 | ... 
158 | 159 | with pytest.raises(FileNotFoundError): 160 | fs.copy(remote_dir + "/u.txt", remote_dir + "/y.txt") 161 | 162 | 163 | def test_open_rw(fs, remote_dir): 164 | data = b"dvc.org" 165 | 166 | with fs.open(remote_dir + "/a.txt", "wb") as stream: 167 | stream.write(data) 168 | 169 | with fs.open(remote_dir + "/a.txt") as stream: 170 | assert stream.read() == data 171 | 172 | 173 | def test_open_rw_flush(fs, remote_dir): 174 | data = b"dvc.org" 175 | 176 | with fs.open(remote_dir + "/b.txt", "wb") as stream: 177 | for _ in range(200): 178 | stream.write(data) 179 | stream.write(data) 180 | stream.flush() 181 | 182 | with fs.open(remote_dir + "/b.txt", "rb") as stream: 183 | assert stream.read() == data * 400 184 | -------------------------------------------------------------------------------- /fsspec/compression.py: -------------------------------------------------------------------------------- 1 | """Helper functions for a standard streaming compression API""" 2 | from bz2 import BZ2File 3 | from zipfile import ZipFile 4 | 5 | import fsspec.utils 6 | from fsspec.spec import AbstractBufferedFile 7 | 8 | 9 | def noop_file(file, mode, **kwargs): 10 | return file 11 | 12 | 13 | # TODO: files should also be available as contexts 14 | # should be functions of the form func(infile, mode=, **kwargs) -> file-like 15 | compr = {None: noop_file} 16 | 17 | 18 | def register_compression(name, callback, extensions, force=False): 19 | """Register an "inferable" file compression type. 20 | 21 | Registers transparent file compression type for use with fsspec.open. 22 | Compression can be specified by name in open, or "infer"-ed for any files 23 | ending with the given extensions. 24 | 25 | Args: 26 | name: (str) The compression type name. Eg. "gzip". 27 | callback: A callable of form (infile, mode, **kwargs) -> file-like. 28 | Accepts an input file-like object, the target mode and kwargs. 29 | Returns a wrapped file-like object. 30 | extensions: (str, Iterable[str]) A file extension, or list of file 31 | extensions for which to infer this compression scheme. Eg. "gz". 32 | force: (bool) Force re-registration of compression type or extensions. 33 | 34 | Raises: 35 | ValueError: If name or extensions already registered, and not force. 36 | 37 | """ 38 | if isinstance(extensions, str): 39 | extensions = [extensions] 40 | 41 | # Validate registration 42 | if name in compr and not force: 43 | raise ValueError("Duplicate compression registration: %s" % name) 44 | 45 | for ext in extensions: 46 | if ext in fsspec.utils.compressions and not force: 47 | raise ValueError( 48 | "Duplicate compression file extension: %s (%s)" % (ext, name) 49 | ) 50 | 51 | compr[name] = callback 52 | 53 | for ext in extensions: 54 | fsspec.utils.compressions[ext] = name 55 | 56 | 57 | def unzip(infile, mode="rb", filename=None, **kwargs): 58 | if "r" not in mode: 59 | filename = filename or "file" 60 | z = ZipFile(infile, mode="w", **kwargs) 61 | fo = z.open(filename, mode="w") 62 | fo.close = lambda closer=fo.close: closer() or z.close() 63 | return fo 64 | z = ZipFile(infile) 65 | if filename is None: 66 | filename = z.namelist()[0] 67 | return z.open(filename, mode="r", **kwargs) 68 | 69 | 70 | register_compression("zip", unzip, "zip") 71 | register_compression("bz2", BZ2File, "bz2") 72 | 73 | try: # pragma: no cover 74 | from isal import igzip 75 | 76 | # igzip is meant to be used as a faster drop in replacement to gzip 77 | # so its api and functions are the same as the stdlib’s module. 
Except 78 | # where ISA-L does not support the same calls as zlib 79 | # (See https://python-isal.readthedocs.io/). 80 | 81 | register_compression("gzip", igzip.IGzipFile, "gz") 82 | except ImportError: 83 | from gzip import GzipFile 84 | 85 | register_compression( 86 | "gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz" 87 | ) 88 | 89 | try: 90 | from lzma import LZMAFile 91 | 92 | register_compression("lzma", LZMAFile, "xz") 93 | register_compression("xz", LZMAFile, "xz", force=True) 94 | except ImportError: 95 | pass 96 | 97 | try: 98 | import lzmaffi 99 | 100 | register_compression("lzma", lzmaffi.LZMAFile, "xz", force=True) 101 | register_compression("xz", lzmaffi.LZMAFile, "xz", force=True) 102 | except ImportError: 103 | pass 104 | 105 | 106 | class SnappyFile(AbstractBufferedFile): 107 | def __init__(self, infile, mode, **kwargs): 108 | import snappy 109 | 110 | super().__init__( 111 | fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs 112 | ) 113 | self.infile = infile 114 | if "r" in mode: 115 | self.codec = snappy.StreamDecompressor() 116 | else: 117 | self.codec = snappy.StreamCompressor() 118 | 119 | def _upload_chunk(self, final=False): 120 | self.buffer.seek(0) 121 | out = self.codec.add_chunk(self.buffer.read()) 122 | self.infile.write(out) 123 | return True 124 | 125 | def seek(self, loc, whence=0): 126 | raise NotImplementedError("SnappyFile is not seekable") 127 | 128 | def seekable(self): 129 | return False 130 | 131 | def _fetch_range(self, start, end): 132 | """Get the specified set of bytes from remote""" 133 | data = self.infile.read(end - start) 134 | return self.codec.decompress(data) 135 | 136 | 137 | try: 138 | import snappy 139 | 140 | snappy.compress 141 | # Snappy may use the .sz file extension, but this is not part of the 142 | # standard implementation. 
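# (Registering with an empty extension list means "snappy" is never picked
# by infer_compression from a file name; it must be requested explicitly,
# e.g. fsspec.open(..., compression="snappy").)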
143 | register_compression("snappy", SnappyFile, []) 144 | 145 | except (ImportError, NameError): 146 | pass 147 | 148 | try: 149 | import lz4.frame 150 | 151 | register_compression("lz4", lz4.frame.open, "lz4") 152 | except ImportError: 153 | pass 154 | 155 | try: 156 | import zstandard as zstd 157 | 158 | def zstandard_file(infile, mode="rb"): 159 | if "r" in mode: 160 | cctx = zstd.ZstdDecompressor() 161 | return cctx.stream_reader(infile) 162 | else: 163 | cctx = zstd.ZstdCompressor(level=10) 164 | return cctx.stream_writer(infile) 165 | 166 | register_compression("zstd", zstandard_file, "zst") 167 | except ImportError: 168 | pass 169 | 170 | 171 | def available_compressions(): 172 | """Return a list of the implemented compressions.""" 173 | return list(compr) 174 | -------------------------------------------------------------------------------- /fsspec/tests/test_compression.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pytest 4 | 5 | import fsspec.core 6 | from fsspec.compression import compr, register_compression 7 | from fsspec.utils import compressions, infer_compression 8 | 9 | 10 | def test_infer_custom_compression(): 11 | """Inferred compression gets values from fsspec.compression.compr.""" 12 | assert infer_compression("fn.zip") == "zip" 13 | assert infer_compression("fn.gz") == "gzip" 14 | assert infer_compression("fn.unknown") is None 15 | assert infer_compression("fn.test_custom") is None 16 | assert infer_compression("fn.tst") is None 17 | 18 | register_compression("test_custom", lambda f, **kwargs: f, "tst") 19 | 20 | try: 21 | assert infer_compression("fn.zip") == "zip" 22 | assert infer_compression("fn.gz") == "gzip" 23 | assert infer_compression("fn.unknown") is None 24 | assert infer_compression("fn.test_custom") is None 25 | assert infer_compression("fn.tst") == "test_custom" 26 | 27 | # Duplicate registration in name or extension raises a value error. 28 | with pytest.raises(ValueError): 29 | register_compression("test_custom", lambda f, **kwargs: f, "tst") 30 | 31 | with pytest.raises(ValueError): 32 | register_compression("test_conflicting", lambda f, **kwargs: f, "tst") 33 | assert "test_conflicting" not in compr 34 | 35 | # ...but can be forced. 
36 | register_compression( 37 | "test_conflicting", lambda f, **kwargs: f, "tst", force=True 38 | ) 39 | assert infer_compression("fn.zip") == "zip" 40 | assert infer_compression("fn.gz") == "gzip" 41 | assert infer_compression("fn.unknown") is None 42 | assert infer_compression("fn.test_custom") is None 43 | assert infer_compression("fn.tst") == "test_conflicting" 44 | 45 | finally: 46 | del compr["test_custom"] 47 | del compr["test_conflicting"] 48 | del compressions["tst"] 49 | 50 | 51 | def test_infer_uppercase_compression(): 52 | assert infer_compression("fn.ZIP") == "zip" 53 | assert infer_compression("fn.GZ") == "gzip" 54 | assert infer_compression("fn.UNKNOWN") is None 55 | assert infer_compression("fn.TEST_UPPERCASE") is None 56 | assert infer_compression("fn.TEST") is None 57 | 58 | 59 | def test_lzma_compression_name(): 60 | pytest.importorskip("lzma") 61 | assert infer_compression("fn.xz") == "xz" 62 | 63 | 64 | def test_lz4_compression(tmpdir): 65 | """Infer lz4 compression for .lz4 files if lz4 is available.""" 66 | tmp_path = pathlib.Path(str(tmpdir)) 67 | 68 | lz4 = pytest.importorskip("lz4") 69 | 70 | tmp_path.mkdir(exist_ok=True) 71 | 72 | tdat = "foobar" * 100 73 | 74 | with fsspec.core.open( 75 | str(tmp_path / "out.lz4"), mode="wt", compression="infer" 76 | ) as outfile: 77 | outfile.write(tdat) 78 | 79 | compressed = (tmp_path / "out.lz4").open("rb").read() 80 | assert lz4.frame.decompress(compressed).decode() == tdat 81 | 82 | with fsspec.core.open( 83 | str(tmp_path / "out.lz4"), mode="rt", compression="infer" 84 | ) as infile: 85 | assert infile.read() == tdat 86 | 87 | with fsspec.core.open( 88 | str(tmp_path / "out.lz4"), mode="rt", compression="lz4" 89 | ) as infile: 90 | assert infile.read() == tdat 91 | 92 | 93 | def test_zstd_compression(tmpdir): 94 | """Infer zstd compression for .zst files if zstandard is available.""" 95 | tmp_path = pathlib.Path(str(tmpdir)) 96 | 97 | zstd = pytest.importorskip("zstandard") 98 | 99 | tmp_path.mkdir(exist_ok=True) 100 | 101 | tdat = "foobar" * 100 102 | 103 | with fsspec.core.open( 104 | str(tmp_path / "out.zst"), mode="wt", compression="infer" 105 | ) as outfile: 106 | outfile.write(tdat) 107 | 108 | compressed = (tmp_path / "out.zst").open("rb").read() 109 | assert zstd.ZstdDecompressor().decompress(compressed, len(tdat)).decode() == tdat 110 | 111 | with fsspec.core.open( 112 | str(tmp_path / "out.zst"), mode="rt", compression="infer" 113 | ) as infile: 114 | assert infile.read() == tdat 115 | 116 | with fsspec.core.open( 117 | str(tmp_path / "out.zst"), mode="rt", compression="zstd" 118 | ) as infile: 119 | assert infile.read() == tdat 120 | 121 | # fails in https://github.com/fsspec/filesystem_spec/issues/725 122 | infile = fsspec.core.open( 123 | str(tmp_path / "out.zst"), mode="rb", compression="infer" 124 | ).open() 125 | 126 | infile.close() 127 | 128 | 129 | def test_snappy_compression(tmpdir): 130 | """No registered compression for snappy, but can be specified.""" 131 | tmp_path = pathlib.Path(str(tmpdir)) 132 | 133 | snappy = pytest.importorskip("snappy") 134 | 135 | tmp_path.mkdir(exist_ok=True) 136 | 137 | tdat = "foobar" * 100 138 | 139 | # Snappy isn't inferred. 140 | with fsspec.core.open( 141 | str(tmp_path / "out.snappy"), mode="wt", compression="infer" 142 | ) as outfile: 143 | outfile.write(tdat) 144 | assert (tmp_path / "out.snappy").open("rb").read().decode() == tdat 145 | 146 | # but can be specified. 
147 | with fsspec.core.open( 148 | str(tmp_path / "out.snappy"), mode="wt", compression="snappy" 149 | ) as outfile: 150 | outfile.write(tdat) 151 | 152 | compressed = (tmp_path / "out.snappy").open("rb").read() 153 | assert snappy.StreamDecompressor().decompress(compressed).decode() == tdat 154 | 155 | with fsspec.core.open( 156 | str(tmp_path / "out.snappy"), mode="rb", compression="infer" 157 | ) as infile: 158 | assert infile.read() == compressed 159 | 160 | with fsspec.core.open( 161 | str(tmp_path / "out.snappy"), mode="rt", compression="snappy" 162 | ) as infile: 163 | assert infile.read() == tdat 164 | -------------------------------------------------------------------------------- /fsspec/implementations/sftp.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import types 4 | import uuid 5 | from stat import S_ISDIR, S_ISLNK 6 | 7 | import paramiko 8 | 9 | from .. import AbstractFileSystem 10 | from ..utils import infer_storage_options 11 | 12 | logger = logging.getLogger("fsspec.sftp") 13 | 14 | 15 | class SFTPFileSystem(AbstractFileSystem): 16 | """Files over SFTP/SSH 17 | 18 | Peer-to-peer filesystem over SSH using paramiko. 19 | 20 | Note: if using this with the ``open`` or ``open_files``, with full URLs, 21 | there is no way to tell if a path is relative, so all paths are assumed 22 | to be absolute. 23 | """ 24 | 25 | protocol = "sftp", "ssh" 26 | 27 | def __init__(self, host, **ssh_kwargs): 28 | """ 29 | 30 | Parameters 31 | ---------- 32 | host: str 33 | Hostname or IP as a string 34 | temppath: str 35 | Location on the server to put files, when within a transaction 36 | ssh_kwargs: dict 37 | Parameters passed on to connection. See details in 38 | http://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect 39 | May include port, username, password... 
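 
        Example (an illustrative sketch only; the host and credentials
        below are placeholders, not working values):
 
            fs = SFTPFileSystem("server.example.com", username="user",
                                password="pw", port=22)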
40 | """ 41 | if self._cached: 42 | return 43 | super(SFTPFileSystem, self).__init__(**ssh_kwargs) 44 | self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory 45 | self.host = host 46 | self.ssh_kwargs = ssh_kwargs 47 | self._connect() 48 | 49 | def _connect(self): 50 | logger.debug("Connecting to SFTP server %s" % self.host) 51 | self.client = paramiko.SSHClient() 52 | self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 53 | self.client.connect(self.host, **self.ssh_kwargs) 54 | self.ftp = self.client.open_sftp() 55 | 56 | @classmethod 57 | def _strip_protocol(cls, path): 58 | return infer_storage_options(path)["path"] 59 | 60 | @staticmethod 61 | def _get_kwargs_from_urls(urlpath): 62 | out = infer_storage_options(urlpath) 63 | out.pop("path", None) 64 | out.pop("protocol", None) 65 | return out 66 | 67 | def mkdir(self, path, mode=511): 68 | logger.debug("Creating folder %s" % path) 69 | self.ftp.mkdir(path, mode) 70 | 71 | def makedirs(self, path, exist_ok=False, mode=511): 72 | if self.exists(path) and not exist_ok: 73 | raise FileExistsError("File exists: {}".format(path)) 74 | 75 | parts = path.split("/") 76 | path = "" 77 | 78 | for part in parts: 79 | path += "/" + part 80 | if not self.exists(path): 81 | self.mkdir(path, mode) 82 | 83 | def rmdir(self, path): 84 | logger.debug("Removing folder %s" % path) 85 | self.ftp.rmdir(path) 86 | 87 | def info(self, path): 88 | stat = self._decode_stat(self.ftp.stat(path)) 89 | stat["name"] = path 90 | return stat 91 | 92 | @staticmethod 93 | def _decode_stat(stat, parent_path=None): 94 | if S_ISDIR(stat.st_mode): 95 | t = "directory" 96 | elif S_ISLNK(stat.st_mode): 97 | t = "link" 98 | else: 99 | t = "file" 100 | out = { 101 | "name": "", 102 | "size": stat.st_size, 103 | "type": t, 104 | "uid": stat.st_uid, 105 | "gid": stat.st_gid, 106 | "time": datetime.datetime.utcfromtimestamp(stat.st_atime), 107 | "mtime": datetime.datetime.utcfromtimestamp(stat.st_mtime), 108 | } 109 | if parent_path: 110 | out["name"] = "/".join([parent_path.rstrip("/"), stat.filename]) 111 | return out 112 | 113 | def ls(self, path, detail=False): 114 | logger.debug("Listing folder %s" % path) 115 | stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)] 116 | if detail: 117 | return stats 118 | else: 119 | paths = [stat["name"] for stat in stats] 120 | return sorted(paths) 121 | 122 | def put(self, lpath, rpath, callback=None, **kwargs): 123 | logger.debug("Put file %s into %s" % (lpath, rpath)) 124 | self.ftp.put(lpath, rpath) 125 | 126 | def get(self, rpath, lpath, callback=None, **kwargs): 127 | logger.debug("Get file %s into %s" % (rpath, lpath)) 128 | self.ftp.get(rpath, lpath) 129 | 130 | def _open(self, path, mode="rb", block_size=None, **kwargs): 131 | """ 132 | block_size: int or None 133 | If 0, no buffering, if 1, line buffering, if >1, buffer that many 134 | bytes, if None use default from paramiko. 
135 | """ 136 | logger.debug("Opening file %s" % path) 137 | if kwargs.get("autocommit", True) is False: 138 | # writes to temporary file, move on commit 139 | path2 = "/".join([self.temppath, str(uuid.uuid4())]) 140 | f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1) 141 | f.temppath = path2 142 | f.targetpath = path 143 | f.fs = self 144 | f.commit = types.MethodType(commit_a_file, f) 145 | f.discard = types.MethodType(discard_a_file, f) 146 | else: 147 | f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1) 148 | return f 149 | 150 | def _rm(self, path): 151 | if self.isdir(path): 152 | self.ftp.rmdir(path) 153 | else: 154 | self.ftp.remove(path) 155 | 156 | def mv(self, old, new): 157 | logger.debug("Renaming %s into %s" % (old, new)) 158 | self.ftp.posix_rename(old, new) 159 | 160 | 161 | def commit_a_file(self): 162 | self.fs.mv(self.temppath, self.targetpath) 163 | 164 | 165 | def discard_a_file(self): 166 | self.fs._rm(self.temppath) 167 | -------------------------------------------------------------------------------- /fsspec/implementations/tests/test_sftp.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | import subprocess 3 | import time 4 | from tarfile import TarFile 5 | 6 | import pytest 7 | 8 | import fsspec 9 | 10 | pytest.importorskip("paramiko") 11 | 12 | 13 | def stop_docker(name): 14 | cmd = shlex.split('docker ps -a -q --filter "name=%s"' % name) 15 | cid = subprocess.check_output(cmd).strip().decode() 16 | if cid: 17 | subprocess.call(["docker", "rm", "-f", cid]) 18 | 19 | 20 | @pytest.fixture(scope="module") 21 | def ssh(): 22 | try: 23 | subprocess.check_call(["docker", "run", "hello-world"]) 24 | except (subprocess.CalledProcessError, FileNotFoundError): 25 | pytest.skip("docker run not available") 26 | return 27 | 28 | # requires docker 29 | cmds = [ 30 | r"apt-get update", 31 | r"apt-get install -y openssh-server", 32 | r"mkdir /var/run/sshd", 33 | "bash -c \"echo 'root:pass' | chpasswd\"", 34 | ( 35 | r"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' " 36 | r"/etc/ssh/sshd_config" 37 | ), 38 | ( 39 | r"sed 's@session\s*required\s*pam_loginuid.so@session optional " 40 | r"pam_loginuid.so@g' -i /etc/pam.d/sshd" 41 | ), 42 | r'bash -c "echo \"export VISIBLE=now\" >> /etc/profile"', 43 | r"/usr/sbin/sshd", 44 | ] 45 | name = "fsspec_sftp" 46 | stop_docker(name) 47 | cmd = "docker run -d -p 9200:22 --name {} ubuntu:16.04 sleep 9000".format(name) 48 | cid = subprocess.check_output(shlex.split(cmd)).strip().decode() 49 | for cmd in cmds: 50 | subprocess.call(["docker", "exec", cid] + shlex.split(cmd)) 51 | try: 52 | time.sleep(1) 53 | yield dict(host="localhost", port=9200, username="root", password="pass") 54 | finally: 55 | stop_docker(name) 56 | 57 | 58 | @pytest.fixture(scope="module") 59 | def root_path(): 60 | return "/home/someuser/" 61 | 62 | 63 | def test_simple(ssh, root_path): 64 | f = fsspec.get_filesystem_class("sftp")(**ssh) 65 | f.mkdirs(root_path + "deeper") 66 | try: 67 | f.touch(root_path + "deeper/afile") 68 | assert f.find(root_path) == [root_path + "deeper/afile"] 69 | assert f.ls(root_path + "deeper/") == [root_path + "deeper/afile"] 70 | assert f.info(root_path + "deeper/afile")["type"] == "file" 71 | assert f.info(root_path + "deeper/afile")["size"] == 0 72 | assert f.exists(root_path) 73 | finally: 74 | f.rm(root_path, recursive=True) 75 | assert not f.exists(root_path) 76 | 77 | 78 | @pytest.mark.parametrize("protocol", ["sftp", 
"ssh"]) 79 | def test_with_url(protocol, ssh): 80 | fo = fsspec.open( 81 | protocol + "://{username}:{password}@{host}:{port}" 82 | "/home/someuserout".format(**ssh), 83 | "wb", 84 | ) 85 | with fo as f: 86 | f.write(b"hello") 87 | fo = fsspec.open( 88 | protocol + "://{username}:{password}@{host}:{port}" 89 | "/home/someuserout".format(**ssh), 90 | "rb", 91 | ) 92 | with fo as f: 93 | assert f.read() == b"hello" 94 | 95 | 96 | @pytest.fixture(scope="module") 97 | def netloc(ssh): 98 | username = ssh.get("username") 99 | password = ssh.get("password") 100 | host = ssh.get("host") 101 | port = ssh.get("port") 102 | userpass = ( 103 | username + ((":" + password) if password is not None else "") + "@" 104 | if username is not None 105 | else "" 106 | ) 107 | netloc = host + ((":" + str(port)) if port is not None else "") 108 | return userpass + netloc 109 | 110 | 111 | def test_simple_with_tar(ssh, netloc, tmp_path, root_path): 112 | 113 | files_to_pack = ["a.txt", "b.txt"] 114 | 115 | tar_filename = make_tarfile(files_to_pack, tmp_path) 116 | 117 | f = fsspec.get_filesystem_class("sftp")(**ssh) 118 | f.mkdirs(root_path + "deeper", exist_ok=True) 119 | try: 120 | remote_tar_filename = root_path + "deeper/somefile.tar" 121 | with f.open(remote_tar_filename, mode="wb") as wfd: 122 | with open(tar_filename, mode="rb") as rfd: 123 | wfd.write(rfd.read()) 124 | fs = fsspec.open("tar::ssh://" + netloc + remote_tar_filename).fs 125 | files = fs.find("/") 126 | assert files == files_to_pack 127 | finally: 128 | f.rm(root_path, recursive=True) 129 | 130 | 131 | def make_tarfile(files_to_pack, tmp_path): 132 | """Create a tarfile with some files.""" 133 | tar_filename = tmp_path / "sometarfile.tar" 134 | for filename in files_to_pack: 135 | with open(tmp_path / filename, mode="w") as fd: 136 | fd.write("") 137 | with TarFile(tar_filename, mode="w") as tf: 138 | for filename in files_to_pack: 139 | tf.add(tmp_path / filename, arcname=filename) 140 | return tar_filename 141 | 142 | 143 | def test_transaction(ssh, root_path): 144 | f = fsspec.get_filesystem_class("sftp")(**ssh) 145 | f.mkdirs(root_path + "deeper", exist_ok=True) 146 | try: 147 | f.start_transaction() 148 | f.touch(root_path + "deeper/afile") 149 | assert f.find(root_path) == [] 150 | f.end_transaction() 151 | assert f.find(root_path) == [root_path + "deeper/afile"] 152 | 153 | with f.transaction: 154 | assert f._intrans 155 | f.touch(root_path + "deeper/afile2") 156 | assert f.find(root_path) == [root_path + "deeper/afile"] 157 | assert f.find(root_path) == [ 158 | root_path + "deeper/afile", 159 | root_path + "deeper/afile2", 160 | ] 161 | finally: 162 | f.rm(root_path, recursive=True) 163 | 164 | 165 | def test_makedirs_exist_ok(ssh): 166 | f = fsspec.get_filesystem_class("sftp")(**ssh) 167 | 168 | f.makedirs("/a/b/c") 169 | 170 | with pytest.raises(FileExistsError, match="/a/b/c"): 171 | f.makedirs("/a/b/c", exist_ok=False) 172 | 173 | f.makedirs("/a/b/c", exist_ok=True) 174 | -------------------------------------------------------------------------------- /fsspec/generic.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from .asyn import AsyncFileSystem 4 | from .callbacks import _DEFAULT_CALLBACK 5 | from .core import filesystem, get_filesystem_class, split_protocol 6 | 7 | _generic_fs = {} 8 | 9 | 10 | def set_generic_fs(protocol, **storage_options): 11 | _generic_fs[protocol] = filesystem(protocol, **storage_options) 12 | 13 | 14 | default_method = "default" 15 | 16 | 
17 | def _resolve_fs(url, method=None, protocol=None, storage_options=None): 18 | """Pick instance of backend FS""" 19 | method = method or default_method 20 | protocol = protocol or split_protocol(url)[0] 21 | storage_options = storage_options or {} 22 | if method == "default": 23 | return filesystem(protocol) 24 | if method == "generic": 25 | return _generic_fs[protocol] 26 | if method == "current": 27 | cls = get_filesystem_class(protocol) 28 | return cls.current() 29 | if method == "options": 30 | return filesystem(protocol, **storage_options.get(protocol, {})) 31 | raise ValueError(f"Unknown FS resolution method: {method}") 32 | 33 | 34 | class GenericFileSystem(AsyncFileSystem): 35 | """Wrapper over all other FS types 36 | 37 | 38 | 39 | This implementation is a single unified interface to be able to run FS operations 40 | over generic URLs, and dispatch to the specific implementations using the URL 41 | protocol prefix. 42 | 43 | Note: instances of this FS are always async, even if you never use it with any async 44 | backend. 45 | """ 46 | 47 | protocol = "generic" # there is no real reason to ever use a protocol with this FS 48 | 49 | def __init__(self, default_method=None, **kwargs): 50 | """ 51 | 52 | Parameters 53 | ---------- 54 | default_method: str (optional) 55 | Defines how to configure backend FS instances. Options are: 56 | - "default" (you get this with None): instantiate like FSClass(), with no 57 | extra arguments; this is the default instance of that FS, and can be 58 | configured via the config system 59 | - "generic": takes instances from the `_generic_fs` dict in this module, 60 | which you must populate before use. Keys are by protocol 61 | - "current": takes the most recently instantiated version of each FS 62 | - "options": expect ``storage_options`` to be passed along with every call. 
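 
        Example (illustrative only; "file" stands in for any registered
        protocol):
 
            fs = GenericFileSystem()  # "default": plain FSClass() per protocol
            fs.info("file:///tmp")  # dispatched to the local filesystem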
63 | """ 64 | self.method = default_method 65 | super(GenericFileSystem, self).__init__(**kwargs) 66 | 67 | async def _info( 68 | self, url, method=None, protocol=None, storage_options=None, fs=None, **kwargs 69 | ): 70 | fs = fs or _resolve_fs(url, method or self.method, protocol, storage_options) 71 | if fs.async_impl: 72 | out = await fs._info(url, **kwargs) 73 | else: 74 | out = fs.info(url, **kwargs) 75 | out["name"] = fs.unstrip_protocol(out["name"]) 76 | return out 77 | 78 | async def _ls( 79 | self, 80 | url, 81 | method=None, 82 | protocol=None, 83 | storage_options=None, 84 | fs=None, 85 | detail=True, 86 | **kwargs, 87 | ): 88 | fs = fs or _resolve_fs(url, method or self.method, protocol, storage_options) 89 | if fs.async_impl: 90 | out = await fs._ls(url, detail=True, **kwargs) 91 | else: 92 | out = fs.ls(url, detail=True, **kwargs) 93 | for o in out: 94 | o["name"] = fs.unstrip_protocol(o["name"]) 95 | if detail: 96 | return out 97 | else: 98 | return [o["name"] for o in out] 99 | 100 | async def _rm( 101 | self, url, method=None, protocol=None, storage_options=None, fs=None, **kwargs 102 | ): 103 | fs = fs or _resolve_fs(url, method or self.method, protocol, storage_options) 104 | if fs.async_impl: 105 | await fs._rm(url, **kwargs) 106 | else: 107 | fs.rm(url, **kwargs) 108 | 109 | async def _cp_file( 110 | self, 111 | url, 112 | url2, 113 | method=None, 114 | protocol=None, 115 | storage_options=None, 116 | fs=None, 117 | method2=None, 118 | protocol2=None, 119 | storage_options2=None, 120 | fs2=None, 121 | blocksize=2**20, 122 | callback=_DEFAULT_CALLBACK, 123 | **kwargs, 124 | ): 125 | fs = fs or _resolve_fs(url, method or self.method, protocol, storage_options) 126 | fs2 = fs2 or _resolve_fs( 127 | url2, method2 or self.method, protocol2, storage_options2 128 | ) 129 | if fs is fs2: 130 | # pure remote 131 | if fs.async_impl: 132 | return await fs._cp_file(url, url2, **kwargs) 133 | else: 134 | return fs.cp_file(url, url2, **kwargs) 135 | kw = {"blocksize": 0, "cache_type": "none"} 136 | try: 137 | f1 = ( 138 | await fs.open_async(url, "rb") 139 | if hasattr(fs, "open_async") 140 | else fs.open(url, "rb", **kw) 141 | ) 142 | callback.set_size(maybe_await(f1.size)) 143 | f2 = ( 144 | await fs2.open_async(url2, "wb") 145 | if hasattr(fs2, "open_async") 146 | else fs2.open(url2, "wb", **kw) 147 | ) 148 | while f1.size is None or f2.tell() < f1.size: 149 | data = await maybe_await(f1.read(blocksize)) 150 | if f1.size is None and not data: 151 | break 152 | await maybe_await(f2.write(data)) 153 | callback.absolute_update(f2.tell()) 154 | finally: 155 | try: 156 | await maybe_await(f2.close()) 157 | await maybe_await(f1.close()) 158 | except NameError: 159 | # fail while opening f1 or f2 160 | pass 161 | 162 | 163 | async def maybe_await(cor): 164 | if inspect.iscoroutine(cor): 165 | return await cor 166 | else: 167 | return cor 168 | -------------------------------------------------------------------------------- /docs/source/async.rst: -------------------------------------------------------------------------------- 1 | Async 2 | ===== 3 | 4 | 5 | 6 | ``fsspec`` supports asynchronous operations on certain implementations. This 7 | allows for concurrent calls within bulk operations such as ``cat`` (fetch 8 | the contents of many files at once) even from normal code, and for the direct 9 | use of fsspec in async code without blocking. 10 | Async implementations derive from the class ``fsspec.async.AsyncFileSystem``. 
11 | The class attribute ``async_impl`` can be used to test whether an
12 | implementation is async or not.
13 | 
14 | ``AsyncFileSystem`` contains ``async def`` coroutine versions of the methods of
15 | ``AbstractFileSystem``. By convention, these methods are prefixed with "_"
16 | to indicate that they are not to be called directly in normal code, only
17 | when you know what you are doing. In most cases, the code is identical or
18 | slightly modified by replacing sync calls with ``await`` calls to async
19 | functions.
20 | 
21 | The only async implementation built into ``fsspec`` is ``HTTPFileSystem``.
22 | 
23 | Synchronous API
24 | ---------------
25 | 
26 | The methods of ``AbstractFileSystem`` are available and can be called from
27 | normal code. They call and wait on the corresponding async function. The
28 | *work* is carried out in a separate thread, so if there are many fsspec
29 | operations in flight at once, launched from many threads, they will still
30 | all be processed on the same IO-dedicated thread.
31 | 
32 | Most users should not be aware that their code is running async.
33 | 
34 | Note that the sync functions are wrapped using ``sync_wrapper``, which
35 | copies the docstrings from ``AbstractFileSystem``, unless they are
36 | explicitly given in the implementation.
37 | 
38 | Example:
39 | 
40 | .. code-block:: python
41 | 
42 |     fs = fsspec.filesystem("http")
43 |     out = fs.cat([url1, url2, url3])  # fetches data concurrently
44 | 
45 | Coroutine batching
46 | ------------------
47 | 
48 | The various methods which create many coroutines to be passed to the event loop
49 | for processing may be batched: submitting a certain number in one go and waiting
50 | for them to complete before launching more. This is important to work around
51 | local open-file limits (which can be <~100) and not to swamp the heap.
52 | 
53 | ``fsspec.asyn._run_coros_in_chunks`` controls this process, but from the user's point
54 | of view, there are three ways to affect it (see the sketch after this list). In increasing order of precedence:
55 | 
56 | - the global variables ``fsspec.asyn._DEFAULT_BATCH_SIZE`` and
57 |   ``fsspec.asyn._NOFILES_DEFAULT_BATCH_SIZE`` (for calls involving local files or not,
58 |   respectively)
59 | 
60 | - config keys "gather_batch_size" and "nofiles_gather_batch_size"
61 | 
62 | - the ``batch_size`` keyword, accepted by the batch methods of an async filesystem.
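
For example, a minimal sketch of the config-level and call-level controls
(the batch sizes here are purely illustrative, not recommendations):

.. code-block:: python

    import fsspec

    fsspec.config.conf["gather_batch_size"] = 128  # config key

    fs = fsspec.filesystem("http")
    # the explicit keyword takes precedence over config and globals
    out = fs.cat([url1, url2, url3], batch_size=32)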
63 | 
64 | 
65 | Using from Async
66 | ----------------
67 | 
68 | File system instances can be created with ``asynchronous=True``. This
69 | implies that the instantiation is happening within a coroutine, so
70 | the various async methods can be called directly with ``await``, as is
71 | normal in async code.
72 | 
73 | Note that, because ``__init__`` is a blocking function, any creation
74 | of asynchronous resources will be deferred. You will normally need to
75 | explicitly ``await`` a coroutine to create them. Since garbage collection
76 | also happens in blocking code, you may wish to explicitly await
77 | resource destructors too. Example:
78 | 
79 | .. code-block:: python
80 | 
81 |     async def work_coroutine():
82 |         fs = fsspec.filesystem("http", asynchronous=True)
83 |         session = await fs.set_session()  # creates client
84 |         out = await fs._cat([url1, url2, url3])  # fetches data concurrently
85 |         await session.close()  # explicit destructor
86 | 
87 |     asyncio.run(work_coroutine())
88 | 
89 | Bring your own loop
90 | -------------------
91 | 
92 | For the non-asynchronous case, ``fsspec`` will normally create an asyncio
93 | event loop on a specific thread. However, the calling application may prefer
94 | IO processes to run on a loop that is already around and running (in another
95 | thread). The loop needs to be asyncio compliant, but does not necessarily need
96 | to be an ``asyncio.events.AbstractEventLoop``. Example:
97 | 
98 | .. code-block:: python
99 | 
100 |     loop = ...  # however a loop was made, running on another thread
101 |     fs = fsspec.filesystem("http", loop=loop)
102 |     out = fs.cat([url1, url2, url3])  # fetches data concurrently
103 | 
104 | 
105 | Implementing new backends
106 | -------------------------
107 | 
108 | Async file systems should derive from ``AsyncFileSystem``, and implement the
109 | ``async def _*`` coroutines there. These functions will either have sync versions
110 | automatically generated if the name is in the ``async_methods`` list, or
111 | can be directly created using ``sync_wrapper``.
112 | 
113 | .. code-block:: python
114 | 
115 |     class MyFileSystem(AsyncFileSystem):
116 | 
117 |         async def _my_method(self):
118 |             ...
119 | 
120 |         my_method = sync_wrapper(_my_method)
121 | 
122 | 
123 | These functions must **not call** methods or functions which themselves are synced,
124 | but should instead ``await`` other coroutines. Calling methods which do not require sync,
125 | such as ``_strip_protocol``, is fine.
126 | 
127 | Note that ``__init__`` cannot be ``async``, so it might need to allocate async
128 | resources using the ``sync`` function, but *only* if ``asynchronous=False``. If it
129 | is ``True``, you probably need to require the caller to await a coroutine that
130 | creates those resources. Similarly, any destructor (e.g., ``__del__``) will run from normal
131 | code, and possibly after the loop has stopped/closed.
132 | 
133 | To call ``sync``, you will need to pass the associated event loop, which will be
134 | available as the attribute ``.loop``.
135 | 
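For example, a minimal sketch of a hand-written sync wrapper (``_my_method``
is the hypothetical coroutine from the snippet above):

.. code-block:: python

    from fsspec.asyn import sync

    class MyFileSystem(AsyncFileSystem):

        async def _my_method(self):
            ...

        def my_method(self):
            # run the coroutine on the dedicated IO loop and wait for it
            return sync(self.loop, self._my_method)
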
136 | .. autosummary::
137 |    fsspec.asyn.AsyncFileSystem
138 |    fsspec.asyn.sync
139 |    fsspec.asyn.sync_wrapper
140 |    fsspec.asyn.get_loop
141 | 
142 | .. autoclass:: fsspec.asyn.AsyncFileSystem
143 |    :members:
144 | 
145 | .. autofunction:: fsspec.asyn.sync
146 | 
147 | .. autofunction:: fsspec.asyn.sync_wrapper
148 | 
149 | .. autofunction:: fsspec.asyn.get_loop
150 | 
151 | .. autofunction:: fsspec.asyn.fsspec_loop
152 | 
--------------------------------------------------------------------------------
/fsspec/tests/test_file.py:
--------------------------------------------------------------------------------
1 | """Tests abstract buffered file API, using FTP implementation"""
2 | import pickle
3 | 
4 | import pytest
5 | 
6 | from fsspec.implementations.tests.test_ftp import FTPFileSystem
7 | 
8 | data = b"hello" * 10000
9 | 
10 | 
11 | def test_pickle(ftp_writable):
12 |     host, port, user, pw = ftp_writable
13 |     ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
14 | 
15 |     f = ftp.open("/out", "rb")
16 | 
17 |     f2 = pickle.loads(pickle.dumps(f))
18 |     assert f == f2
19 | 
20 | 
21 | def test_file_read_attributes(ftp_writable):
22 |     host, port, user, pw = ftp_writable
23 |     ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
24 | 
25 |     f = ftp.open("/out", "rb")
26 |     assert f.info()["size"] == len(data)
27 |     assert f.tell() == 0
28 |     assert f.seekable()
29 |     assert f.readable()
30 |     assert not f.writable()
31 |     out = bytearray(len(data))
32 | 
33 |     assert f.read() == data
34 |     assert f.read() == b""
35 |     f.seek(0)
36 |     assert f.readuntil(b"l") == b"hel"
37 |     assert f.tell() == 3
38 | 
39 |     f.readinto1(out)
40 |     assert out[:-3] == data[3:]
41 |     with pytest.raises(ValueError):
42 |         f.write(b"")
43 |     f.close()
44 |     with pytest.raises(ValueError):
45 |         f.read()
46 | 
47 | 
48 | def test_seek(ftp_writable):
49 |     host, port, user, pw = ftp_writable
50 |     ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
51 | 
52 |     f = ftp.open("/out", "rb")
53 | 
54 |     assert f.seek(-10, 2) == len(data) - 10
55 |     assert f.tell() == len(data) - 10
56 |     assert f.seek(-1, 1) == len(data) - 11
57 |     with pytest.raises(ValueError):
58 |         f.seek(-1)
59 |     with pytest.raises(ValueError):
60 |         f.seek(0, 7)
61 | 
62 | 
63 | def test_file_idempotent(ftp_writable):
64 |     host, port, user, pw = ftp_writable
65 |     ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
66 | 
67 |     f = ftp.open("/out", "rb")
68 |     f2 = ftp.open("/out", "rb")
69 |     assert hash(f) == hash(f2)
70 |     assert f == f2
71 |     ftp.touch("/out2")
72 |     f2 = ftp.open("/out2", "rb")
73 |     assert hash(f2) != hash(f)
74 |     assert f != f2
75 |     f2 = ftp.open("/out", "wb")
76 |     assert hash(f2) != hash(f)
77 | 
78 | 
79 | def test_file_text_attributes(ftp_writable):
80 |     host, port, user, pw = ftp_writable
81 |     ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
82 | 
83 |     data = b"hello\n" * 1000
84 |     with ftp.open("/out2", "wb") as f:
85 |         f.write(data)
86 | 
87 |     f = ftp.open("/out2", "rb")
88 |     assert f.readline() == b"hello\n"
89 |     f.seek(0)
90 |     assert list(f) == [d + b"\n" for d in data.split()]
91 |     f.seek(0)
92 |     assert f.readlines() == [d + b"\n" for d in data.split()]
93 | 
94 |     f = ftp.open("/out2", "rt")
95 |     assert f.readline() == "hello\n"
96 |     assert f.encoding
97 | 
98 | 
99 | def test_file_write_attributes(ftp_writable):
100 |     host, port, user, pw = ftp_writable
101 |     ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
102 |     f = ftp.open("/out2", "wb")
103 |     with pytest.raises(ValueError):
104 |         f.info()
105 |     with pytest.raises(OSError):
106 |         f.seek(0)
107 |     with pytest.raises(ValueError):
108 |         f.read(0)
109 |     assert not f.readable()
110 |     assert f.writable()
111 | 
112 |     f.flush()  # no-op
113 | 
114 |     assert f.write(b"hello") == 5
115 |     assert f.write(b"hello") == 5
116 |     assert not f.closed
117 |     f.close()
118 |     assert f.closed
119 |     with pytest.raises(ValueError):
120 |         f.write(b"")
121 |     with pytest.raises(ValueError):
122 |         f.flush()
123 | 
124 | 
125 | def test_midread_cache(ftp_writable):
126 |     host, port, user, pw = ftp_writable
127 |     fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
128 |     fn = "/myfile"
129 |     with fs.open(fn, "wb") as f:
130 |         f.write(b"a" * 175627146)
131 |     with fs.open(fn, "rb") as f:
132 |         f.seek(175561610)
133 |         d1 = f.read(65536)
134 |         assert len(d1) == 65536
135 | 
136 |         f.seek(4)
137 |         size = 17562198
138 |         d2 = f.read(size)
139 |         assert len(d2) == size
140 | 
141 |         f.seek(17562288)
142 |         size = 17562187
143 |         d3 = f.read(size)
144 |         assert len(d3) == size
145 | 
146 | 
147 | def test_read_block(ftp_writable):
148 |     # not the same as test_read_block in test_utils; this depends on the
149 |     # behaviour of the bytes caching
150 |     from fsspec.utils import read_block
151 | 
152 |     host, port, user, pw = ftp_writable
153 |     fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
154 |     fn = "/myfile"
155 |     with fs.open(fn, "wb") as f:
156 |         f.write(b"a,b\n1,2")
157 |     f = fs.open(fn, "rb", cache_type="bytes")
158 |     assert read_block(f, 0, 6400, b"\n") == b"a,b\n1,2"
159 | 
160 | 
161 | def test_with_gzip(ftp_writable):
162 |     import gzip
163 | 
164 |     data = b"some compressible stuff"
165 |     host, port, user, pw = ftp_writable
166 |     fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
167 |     fn = "/myfile"
168 |     with fs.open(fn, "wb") as f:
169 |         gf = gzip.GzipFile(fileobj=f, mode="w")
170 |         gf.write(data)
171 |         gf.close()
172 |     with fs.open(fn, "rb") as f:
173 |         gf = gzip.GzipFile(fileobj=f, mode="r")
174 |         assert gf.read() == data
175 | 
176 | 
177 | def test_auto_compression(m):
178 |     fs = m
179 |     with fs.open("myfile.gz", mode="wt", compression="infer") as f:
180 |         f.write("text")
181 |     with fs.open("myfile.gz", mode="rt", compression="infer") as f:
182 |         assert f.read() == "text"
183 | 
184 | 
185 | def test_with_zip(ftp_writable):
186 |     import zipfile
187 | 
188 |     data = b"hello zip"
189 |     host, port, user, pw = ftp_writable
190 |     fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
191 |     fn = "/myfile.zip"
192 |     inner_file = "test.txt"
193 |     with fs.open(fn, "wb") as f:
194 |         zf = zipfile.ZipFile(f, mode="w")
195 |         zf.writestr(inner_file, data)
196 |         zf.close()
197 |     with fs.open(fn, "rb") as f:
198 |         zf = zipfile.ZipFile(f, mode="r")
199 |         assert zf.read(inner_file) == data
200 | 
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # fsspec documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Jan 15 18:11:02 2018.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 | 
15 | # If extensions (or modules to document with autodoc) are in another directory,
16 | # add these directories to sys.path here. If the directory is relative to the
17 | # documentation root, use os.path.abspath to make it absolute, as shown here.
18 | #
19 | import os
20 | import sys
21 | 
22 | sys.path.insert(0, os.path.abspath("../.."))
23 | 
24 | 
25 | # -- General configuration ------------------------------------------------
26 | 
27 | # If your documentation needs a minimal Sphinx version, state it here.
28 | #
29 | # needs_sphinx = '1.0'
30 | 
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = [
35 |     "sphinx.ext.autodoc",
36 |     "sphinx.ext.viewcode",
37 |     "sphinx.ext.autosummary",
38 |     "sphinx.ext.extlinks",
39 |     "numpydoc",
40 | ]
41 | 
42 | numpydoc_show_class_members = False
43 | 
44 | # Add any paths that contain templates here, relative to this directory.
45 | templates_path = ["_templates"]
46 | 
47 | # The suffix(es) of source filenames.
48 | # You can specify multiple suffixes as a list of strings:
49 | #
50 | # source_suffix = ['.rst', '.md']
51 | source_suffix = ".rst"
52 | 
53 | # The master toctree document.
54 | master_doc = "index"
55 | 
56 | # General information about the project.
57 | project = "fsspec"
58 | copyright = "2018, Martin Durant"
59 | author = "Martin Durant"
60 | 
61 | # The version info for the project you're documenting, acts as replacement for
62 | # |version| and |release|, also used in various other places throughout the
63 | # built documents.
64 | #
65 | # The short X.Y version.
66 | 
67 | import fsspec
68 | 
69 | version = fsspec.__version__
70 | # The full version, including alpha/beta/rc tags.
71 | release = fsspec.__version__
72 | 
73 | # The language for content autogenerated by Sphinx. Refer to documentation
74 | # for a list of supported languages.
75 | #
76 | # This is also used if you do content translation via gettext catalogs.
77 | # Usually you set "language" from the command line for these cases.
78 | language = None
79 | 
80 | # List of patterns, relative to source directory, that match files and
81 | # directories to ignore when looking for source files.
82 | # These patterns also affect html_static_path and html_extra_path.
83 | exclude_patterns = []
84 | 
85 | # The name of the Pygments (syntax highlighting) style to use.
86 | pygments_style = "sphinx"
87 | 
88 | # If true, `todo` and `todoList` produce output, else they produce nothing.
89 | todo_include_todos = False
90 | 
91 | default_role = "py:obj"
92 | autodoc_mock_imports = [
93 |     "aiohttp",
94 |     "dask",
95 |     "distributed",
96 |     "fuse",
97 |     "libarchive",
98 |     "panel",
99 |     "paramiko",
100 |     "pyarrow",
101 |     "pygit2",
102 |     "requests",
103 |     "smbprotocol",
104 |     "smbclient",
105 | ]
106 | 
107 | 
108 | # -- Options for HTML output ----------------------------------------------
109 | 
110 | # The theme to use for HTML and HTML Help pages. See the documentation for
111 | # a list of builtin themes.
112 | #
113 | html_theme = "sphinx_rtd_theme"
114 | 
115 | # Theme options are theme-specific and customize the look and feel of a theme
116 | # further. For a list of options available for each theme, see the
117 | # documentation.
118 | #
119 | # html_theme_options = {}
120 | 
121 | # Add any paths that contain custom static files (such as style sheets) here,
122 | # relative to this directory. They are copied after the builtin static files,
123 | # so a file named "default.css" will overwrite the builtin "default.css".
124 | html_static_path = []
125 | 
126 | # Custom sidebar templates, must be a dictionary that maps document names
127 | # to template names.
128 | # 129 | # This is required for the alabaster theme 130 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 131 | html_sidebars = { 132 | "**": [ 133 | "relations.html", # needs 'show_related': True theme option to display 134 | "searchbox.html", 135 | ] 136 | } 137 | 138 | 139 | # -- Options for HTMLHelp output ------------------------------------------ 140 | 141 | # Output file base name for HTML help builder. 142 | htmlhelp_basename = "fsspecdoc" 143 | 144 | 145 | # -- Options for LaTeX output --------------------------------------------- 146 | 147 | latex_elements = { 148 | # The paper size ('letterpaper' or 'a4paper'). 149 | # 150 | # 'papersize': 'letterpaper', 151 | # The font size ('10pt', '11pt' or '12pt'). 152 | # 153 | # 'pointsize': '10pt', 154 | # Additional stuff for the LaTeX preamble. 155 | # 156 | # 'preamble': '', 157 | # Latex figure (float) alignment 158 | # 159 | # 'figure_align': 'htbp', 160 | } 161 | 162 | # Grouping the document tree into LaTeX files. List of tuples 163 | # (source start file, target name, title, 164 | # author, documentclass [howto, manual, or own class]). 165 | latex_documents = [ 166 | (master_doc, "fsspec.tex", "fsspec Documentation", "Joseph Crail", "manual") 167 | ] 168 | 169 | 170 | # -- Options for manual page output --------------------------------------- 171 | 172 | # One entry per manual page. List of tuples 173 | # (source start file, name, description, authors, manual section). 174 | man_pages = [(master_doc, "fsspec", "fsspec Documentation", [author], 1)] 175 | 176 | 177 | # -- Options for Texinfo output ------------------------------------------- 178 | 179 | # Grouping the document tree into Texinfo files. List of tuples 180 | # (source start file, target name, title, author, 181 | # dir menu entry, description, category) 182 | texinfo_documents = [ 183 | ( 184 | master_doc, 185 | "fsspec", 186 | "fsspec Documentation", 187 | author, 188 | "fsspec", 189 | "One line description of project.", 190 | "Miscellaneous", 191 | ) 192 | ] 193 | 194 | extlinks = { 195 | "issue": ("https://github.com/fsspec/filesystem_spec/issues/%s", "GH#"), 196 | "pr": ("https://github.com/fsspec/filesystem_spec/pull/%s", "GH#"), 197 | } 198 | -------------------------------------------------------------------------------- /fsspec/implementations/tar.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import tarfile 4 | import weakref 5 | from io import BufferedReader 6 | 7 | import fsspec 8 | from fsspec.archive import AbstractArchiveFileSystem 9 | from fsspec.compression import compr 10 | from fsspec.utils import infer_compression 11 | 12 | typemap = {b"0": "file", b"5": "directory"} 13 | 14 | logger = logging.getLogger("tar") 15 | 16 | 17 | class TarFileSystem(AbstractArchiveFileSystem): 18 | """Compressed Tar archives as a file-system (read-only) 19 | 20 | Supports the following formats: 21 | tar.gz, tar.bz2, tar.xz 22 | """ 23 | 24 | root_marker = "" 25 | protocol = "tar" 26 | cachable = False 27 | 28 | def __init__( 29 | self, 30 | fo="", 31 | index_store=None, 32 | target_options=None, 33 | target_protocol=None, 34 | compression=None, 35 | **kwargs, 36 | ): 37 | super().__init__(**kwargs) 38 | target_options = target_options or {} 39 | 40 | if isinstance(fo, str): 41 | fo = fsspec.open(fo, protocol=target_protocol, **target_options).open() 42 | 43 | # Try to infer compression. 
44 |         if compression is None:
45 |             name = None
46 | 
47 |             # Try different ways to get hold of the filename. `fo` might either
48 |             # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
49 |             # `fsspec.AbstractFileSystem` instance.
50 |             try:
51 |                 # Amended io.BufferedReader or similar.
52 |                 # This uses a "protocol extension" where original filenames are
53 |                 # propagated to archive-like filesystems in order to let them
54 |                 # infer the right compression appropriately.
55 |                 if hasattr(fo, "original"):
56 |                     name = fo.original
57 | 
58 |                 # fsspec.LocalFileOpener
59 |                 elif hasattr(fo, "path"):
60 |                     name = fo.path
61 | 
62 |                 # io.BufferedReader
63 |                 elif hasattr(fo, "name"):
64 |                     name = fo.name
65 | 
66 |                 # fsspec.AbstractFileSystem
67 |                 elif hasattr(fo, "info"):
68 |                     name = fo.info()["name"]
69 | 
70 |             except Exception as ex:
71 |                 logger.warning(
72 |                     f"Unable to determine file name, not inferring compression: {ex}"
73 |                 )
74 | 
75 |             if name is not None:
76 |                 compression = infer_compression(name)
77 |                 logger.info(f"Inferred compression {compression} from file name {name}")
78 | 
79 |         if compression is not None:
80 |             # TODO: tarfile already implements compression with modes like "r:gz",
81 |             # but would seeking to an offset within the file still work then?
82 |             fo = compr[compression](fo)
83 | 
84 |         self._fo_ref = fo
85 |         weakref.finalize(self, fo.close)
86 |         self.fo = fo.__enter__()  # the whole instance is a context
87 |         self.tar = tarfile.TarFile(fileobj=self.fo)
88 |         self.dir_cache = None
89 | 
90 |         self.index_store = index_store
91 |         self.index = None
92 |         self._index()
93 | 
94 |     def _index(self):
95 |         # TODO: load and set saved index, if exists
96 |         out = {}
97 |         for ti in self.tar:
98 |             info = ti.get_info()
99 |             info["type"] = typemap.get(info["type"], "file")
100 |             name = info["name"].rstrip("/")
101 |             out[name] = (info, ti.offset_data)
102 | 
103 |         self.index = out
104 |         # TODO: save index to self.index_store here, if set
105 | 
106 |     def _get_dirs(self):
107 |         if self.dir_cache is not None:
108 |             return
109 | 
110 |         # This enables ls to get directories as children as well as files
111 |         self.dir_cache = {
112 |             dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"}
113 |             for dirname in self._all_dirnames(self.tar.getnames())
114 |         }
115 |         for member in self.tar.getmembers():
116 |             info = member.get_info()
117 |             info["type"] = typemap.get(info["type"], "file")
118 |             self.dir_cache[info["name"]] = info
119 | 
120 |     def _open(self, path, mode="rb", **kwargs):
121 |         if mode != "rb":
122 |             raise ValueError("Read-only filesystem implementation")
123 |         details, offset = self.index[path]
124 |         if details["type"] != "file":
125 |             raise ValueError("Can only handle regular files")
126 | 
127 |         # `LocalFileSystem` offers its resources as `io.BufferedReader`
128 |         # objects, which can't be copied.
129 |         if isinstance(self.fo, BufferedReader):
130 |             newfo = self.fo
131 |         else:
132 |             newfo = copy.copy(self.fo)
133 |         newfo.seek(offset)
134 | 
135 |         return TarContainedFile(newfo, self.info(path))
136 | 
137 | 
138 | class TarContainedFile(object):
139 |     """
140 |     Represent/wrap a TarFileSystem's file object.
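
    Example (an illustrative sketch; assumes ``archive.tar.gz`` contains a
    member ``data.csv``):

        fs = TarFileSystem("archive.tar.gz")  # compression inferred from name
        with fs.open("data.csv") as f:        # yields a TarContainedFile
            header = f.read(100)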
141 |     """
142 | 
143 |     def __init__(self, of, info):
144 |         self.info = info
145 |         self.size = info["size"]
146 |         self.of = of
147 |         self.start = of.tell()
148 |         self.end = self.start + self.size
149 |         self.closed = False
150 | 
151 |     def tell(self):
152 |         return self.of.tell() - self.start
153 | 
154 |     def read(self, n=-1):
155 |         if self.closed:
156 |             raise ValueError("file is closed")
157 |         if n < 0:
158 |             n = self.end - self.of.tell()
159 |         if n > self.end - self.of.tell():
160 |             n = self.end - self.of.tell()
161 |         if n < 1:
162 |             return b""
163 |         return self.of.read(n)
164 | 
165 |     def seek(self, to, whence=0):
166 |         if self.closed:
167 |             raise ValueError("file is closed")
168 |         if whence == 0:
169 |             to = min(max(self.start, self.start + to), self.end)
170 |         elif whence == 1:
171 |             to = min(max(self.start, self.of.tell() + to), self.end)
172 |         elif whence == 2:
173 |             to = min(max(self.start, self.end + to), self.end)
174 |         else:
175 |             raise ValueError("whence must be one of (0, 1, 2)")
176 |         self.of.seek(to)
177 | 
178 |     def close(self):
179 |         self.of.close()
180 |         self.closed = True
181 | 
182 |     def __enter__(self):
183 |         return self
184 | 
185 |     def __exit__(self, exc_type, exc_val, exc_tb):
186 |         self.close()
187 | 
--------------------------------------------------------------------------------
/fsspec/callbacks.py:
--------------------------------------------------------------------------------
1 | class Callback:
2 |     """
3 |     Base class and interface for callback mechanism
4 | 
5 |     This class can be used directly for monitoring file transfers by
6 |     providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument,
7 |     below), or subclassed for more specialised behaviour.
8 | 
9 |     Parameters
10 |     ----------
11 |     size: int (optional)
12 |         Nominal quantity for the value that corresponds to a complete
13 |         transfer, e.g., total number of tiles or total number of
14 |         bytes
15 |     value: int (0)
16 |         Starting internal counter value
17 |     hooks: dict or None
18 |         A dict of named functions to be called on each update. The signature
19 |         of these must be ``f(size, value, **kwargs)``
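
    Examples
    --------
    A minimal sketch of driving the hooks directly (the hook name
    "print_hook" is arbitrary):

        def print_hook(size, value, **kwargs):
            print(f"{value}/{size}")

        cb = Callback(hooks={"print_hook": print_hook})
        cb.set_size(10)       # prints "0/10"
        cb.relative_update()  # prints "1/10"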
20 |     """
21 | 
22 |     def __init__(self, size=None, value=0, hooks=None, **kwargs):
23 |         self.size = size
24 |         self.value = value
25 |         self.hooks = hooks or {}
26 |         self.kw = kwargs
27 | 
28 |     def set_size(self, size):
29 |         """
30 |         Set the internal maximum size attribute
31 | 
32 |         Usually called if not initially set at instantiation. Note that this
33 |         triggers a ``call()``.
34 | 
35 |         Parameters
36 |         ----------
37 |         size: int
38 |         """
39 |         self.size = size
40 |         self.call()
41 | 
42 |     def absolute_update(self, value):
43 |         """
44 |         Set the internal value state
45 | 
46 |         Triggers ``call()``
47 | 
48 |         Parameters
49 |         ----------
50 |         value: int
51 |         """
52 |         self.value = value
53 |         self.call()
54 | 
55 |     def relative_update(self, inc=1):
56 |         """
57 |         Delta increment the internal counter
58 | 
59 |         Triggers ``call()``
60 | 
61 |         Parameters
62 |         ----------
63 |         inc: int
64 |         """
65 |         self.value += inc
66 |         self.call()
67 | 
68 |     def call(self, hook_name=None, **kwargs):
69 |         """
70 |         Execute hook(s) with current state
71 | 
72 |         Each function is passed the internal size and current value
73 | 
74 |         Parameters
75 |         ----------
76 |         hook_name: str or None
77 |             If given, execute only this hook
78 |         kwargs: passed on to (all) hook(s)
79 |         """
80 |         if not self.hooks:
81 |             return
82 |         kw = self.kw.copy()
83 |         kw.update(kwargs)
84 |         if hook_name:
85 |             if hook_name not in self.hooks:
86 |                 return
87 |             return self.hooks[hook_name](self.size, self.value, **kw)
88 |         for hook in self.hooks.values():
89 |             hook(self.size, self.value, **kw)
90 | 
91 |     def wrap(self, iterable):
92 |         """
93 |         Wrap an iterable to call ``relative_update`` on each iteration
94 | 
95 |         Parameters
96 |         ----------
97 |         iterable: Iterable
98 |             The iterable that is being wrapped
99 |         """
100 |         for item in iterable:
101 |             self.relative_update()
102 |             yield item
103 | 
104 |     def branch(self, path_1, path_2, kwargs):
105 |         """
106 |         Set callbacks for child transfers
107 | 
108 |         Relevant when this callback is operating at a higher level, e.g., ``put``,
109 |         which may trigger transfers that can also be monitored. The passed kwargs
110 |         are to be *mutated* to add ``callback=``, if this class supports branching
111 |         to children.
112 | 
113 |         Parameters
114 |         ----------
115 |         path_1: str
116 |             Child's source path
117 |         path_2: str
118 |             Child's destination path
119 |         kwargs: dict
120 |             arguments passed to child method, e.g., put_file.
121 | 
122 |         Returns
123 |         -------
124 |         None
125 |         """
126 |         return None
127 | 
128 |     def no_op(self, *_, **__):
129 |         pass
130 | 
131 |     def __getattr__(self, item):
132 |         """
133 |         If undefined methods are called on this class, nothing happens
134 |         """
135 |         return self.no_op
136 | 
137 |     @classmethod
138 |     def as_callback(cls, maybe_callback=None):
139 |         """Transform callback=... into Callback instance
140 | 
141 |         For the special value of ``None``, return the global instance of
142 |         ``NoOpCallback``. This is an alternative to including
143 |         ``callback=_DEFAULT_CALLBACK`` directly in a method signature.
144 |         """
145 |         if maybe_callback is None:
146 |             return _DEFAULT_CALLBACK
147 |         return maybe_callback
148 | 
149 | 
150 | class NoOpCallback(Callback):
151 |     """
152 |     This implementation of Callback does exactly nothing
153 |     """
154 | 
155 |     def call(self, *args, **kwargs):
156 |         return None
157 | 
158 | 
159 | class DotPrinterCallback(Callback):
160 |     """
161 |     Simple example Callback implementation
162 | 
163 |     Almost identical to Callback with a hook that prints a char; here we
164 |     demonstrate how the outer layer may print "#" and the inner layer "."
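
    Example (illustrative; ``lpath``/``rpath`` are placeholder paths):

        fs.put(lpath, rpath, recursive=True, callback=DotPrinterCallback())
        # roughly one "#" per update of the outer transfer, one "." per child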
165 |     """
166 | 
167 |     def __init__(self, chr_to_print="#", **kwargs):
168 |         self.chr = chr_to_print
169 |         super().__init__(**kwargs)
170 | 
171 |     def branch(self, path_1, path_2, kwargs):
172 |         """Mutate kwargs to add new instance with different print char"""
173 |         kwargs["callback"] = DotPrinterCallback(".")
174 | 
175 |     def call(self, **kwargs):
176 |         """Just outputs a character"""
177 |         print(self.chr, end="")
178 | 
179 | 
180 | class TqdmCallback(Callback):
181 |     """
182 |     A callback to display a progress bar using tqdm
183 | 
184 |     Parameters
185 |     ----------
186 |     tqdm_kwargs : dict, (optional)
187 |         Any argument accepted by the tqdm constructor.
188 |         See the `tqdm doc <https://tqdm.github.io/docs/tqdm/>`_.
189 |         Will be forwarded to tqdm.
190 | 
191 |     Examples
192 |     --------
193 |     >>> import fsspec
194 |     >>> from fsspec.callbacks import TqdmCallback
195 |     >>> fs = fsspec.filesystem("memory")
196 |     >>> path2distant_data = "/your-path"
197 |     >>> fs.upload(
198 |     ...     ".",
199 |     ...     path2distant_data,
200 |     ...     recursive=True,
201 |     ...     callback=TqdmCallback(),
202 |     ... )
203 | 
204 |     You can forward args to tqdm using the `tqdm_kwargs` parameter.
205 | 
206 |     >>> fs.upload(
207 |     ...     ".",
208 |     ...     path2distant_data,
209 |     ...     recursive=True,
210 |     ...     callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}),
211 |     ... )
212 |     """
213 | 
214 |     def __init__(self, tqdm_kwargs=None, *args, **kwargs):
215 |         try:
216 |             import tqdm
217 | 
218 |             self._tqdm = tqdm
219 |         except ImportError as exce:
220 |             raise ImportError(
221 |                 "Using TqdmCallback requires tqdm to be installed"
222 |             ) from exce
223 | 
224 |         self._tqdm_kwargs = tqdm_kwargs or {}
225 |         super().__init__(*args, **kwargs)
226 | 
227 |     def set_size(self, size):
228 |         self.tqdm = self._tqdm.tqdm(total=size, **self._tqdm_kwargs)
229 | 
230 |     def relative_update(self, inc=1):
231 |         self.tqdm.update(inc)
232 | 
233 |     def __del__(self):
234 |         # the bar only exists once set_size has been called, so guard close()
235 |         if getattr(self, "tqdm", None) is not None:
236 |             self.tqdm.close()
237 |         self.tqdm = None
238 | 
239 | 
240 | _DEFAULT_CALLBACK = NoOpCallback()
241 | 
--------------------------------------------------------------------------------